8 static uint64_t load_3(const unsigned char *in) {
12 result |= shlu64(in[1], 8);
13 result |= shlu64(in[2], 16);
18 static uint64_t load_4(const unsigned char *in) {
22 result |= shlu64(in[1], 8);
23 result |= shlu64(in[2], 16);
24 result |= shlu64(in[3], 24);
71 Can overlap h with f or g.
74 |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
75 |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
78 |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
81 void fe_add(fe h, const fe f, const fe g) {
102 int32_t h0 = f0 + g0;
103 int32_t h1 = f1 + g1;
104 int32_t h2 = f2 + g2;
105 int32_t h3 = f3 + g3;
106 int32_t h4 = f4 + g4;
107 int32_t h5 = f5 + g5;
108 int32_t h6 = f6 + g6;
109 int32_t h7 = f7 + g7;
110 int32_t h8 = f8 + g8;
111 int32_t h9 = f9 + g9;
128 Replace (f,g) with (g,g) if b == 1;
129 replace (f,g) with (f,g) if b == 0.
131 Preconditions: b in {0,1}.
134 void fe_cmov(fe f, const fe g, unsigned int b) {
155 int32_t x0 = f0 ^ g0;
156 int32_t x1 = f1 ^ g1;
157 int32_t x2 = f2 ^ g2;
158 int32_t x3 = f3 ^ g3;
159 int32_t x4 = f4 ^ g4;
160 int32_t x5 = f5 ^ g5;
161 int32_t x6 = f6 ^ g6;
162 int32_t x7 = f7 ^ g7;
163 int32_t x8 = f8 ^ g8;
164 int32_t x9 = f9 ^ g9;
166 b = (unsigned int) (- (int) b); /* silence warning */
191 Replace (f,g) with (g,f) if b == 1;
192 replace (f,g) with (f,g) if b == 0.
194 Preconditions: b in {0,1}.
197 void fe_cswap(fe f,fe g,unsigned int b) {
218 int32_t x0 = f0 ^ g0;
219 int32_t x1 = f1 ^ g1;
220 int32_t x2 = f2 ^ g2;
221 int32_t x3 = f3 ^ g3;
222 int32_t x4 = f4 ^ g4;
223 int32_t x5 = f5 ^ g5;
224 int32_t x6 = f6 ^ g6;
225 int32_t x7 = f7 ^ g7;
226 int32_t x8 = f8 ^ g8;
227 int32_t x9 = f9 ^ g9;
267 void fe_copy(fe h, const fe f) {
294 Ignores top bit of h.
297 void fe_frombytes(fe h, const unsigned char *s) {
298 int64_t h0 = load_4(s);
299 int64_t h1 = load_3(s + 4) << 6;
300 int64_t h2 = load_3(s + 7) << 5;
301 int64_t h3 = load_3(s + 10) << 3;
302 int64_t h4 = load_3(s + 13) << 2;
303 int64_t h5 = load_4(s + 16);
304 int64_t h6 = load_3(s + 20) << 7;
305 int64_t h7 = load_3(s + 23) << 5;
306 int64_t h8 = load_3(s + 26) << 4;
307 int64_t h9 = (load_3(s + 29) & 8388607) << 2;
319 carry9 = (h9 + (1L << 24)) >> 25;
321 h9 -= shl64(carry9, 25);
322 carry1 = (h1 + (1L << 24)) >> 25;
324 h1 -= shl64(carry1, 25);
325 carry3 = (h3 + (1L << 24)) >> 25;
327 h3 -= shl64(carry3, 25);
328 carry5 = (h5 + (1L << 24)) >> 25;
330 h5 -= shl64(carry5, 25);
331 carry7 = (h7 + (1L << 24)) >> 25;
333 h7 -= shl64(carry7, 25);
334 carry0 = (h0 + (1L << 25)) >> 26;
336 h0 -= shl64(carry0, 26);
337 carry2 = (h2 + (1L << 25)) >> 26;
339 h2 -= shl64(carry2, 26);
340 carry4 = (h4 + (1L << 25)) >> 26;
342 h4 -= shl64(carry4, 26);
343 carry6 = (h6 + (1L << 25)) >> 26;
345 h6 -= shl64(carry6, 26);
346 carry8 = (h8 + (1L << 25)) >> 26;
348 h8 -= shl64(carry8, 26);
364 void fe_invert(fe out, const fe z) {
373 for (i = 1; i < 1; ++i) {
379 for (i = 1; i < 2; ++i) {
387 for (i = 1; i < 1; ++i) {
394 for (i = 1; i < 5; ++i) {
401 for (i = 1; i < 10; ++i) {
408 for (i = 1; i < 20; ++i) {
415 for (i = 1; i < 10; ++i) {
422 for (i = 1; i < 50; ++i) {
429 for (i = 1; i < 100; ++i) {
436 for (i = 1; i < 50; ++i) {
443 for (i = 1; i < 5; ++i) {
453 return 1 if f is in {1,3,5,...,q-2}
454 return 0 if f is in {0,2,4,...,q-1}
457 |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
460 int fe_isnegative(const fe f) {
475 |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
478 int fe_isnonzero(const fe f) {
485 #define F(i) r |= s[i]
526 Can overlap h with f or g.
529 |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
530 |g| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
533 |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
537 Notes on implementation strategy:
539 Using schoolbook multiplication.
540 Karatsuba would save a little in some cost models.
542 Most multiplications by 2 and 19 are 32-bit precomputations;
543 cheaper than 64-bit postcomputations.
545 There is one remaining multiplication by 19 in the carry chain;
546 one *19 precomputation can be merged into this,
547 but the resulting data flow is considerably less clean.
549 There are 12 carries below.
550 10 of them are 2-way parallelizable and vectorizable.
551 Can get away with 11 carries, but then data flow is much deeper.
553 With tighter constraints on inputs can squeeze carries into int32.
556 void fe_mul(fe h, const fe f, const fe g) {
577 int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */
578 int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
579 int32_t g3_19 = 19 * g3;
580 int32_t g4_19 = 19 * g4;
581 int32_t g5_19 = 19 * g5;
582 int32_t g6_19 = 19 * g6;
583 int32_t g7_19 = 19 * g7;
584 int32_t g8_19 = 19 * g8;
585 int32_t g9_19 = 19 * g9;
586 int32_t f1_2 = 2 * f1;
587 int32_t f3_2 = 2 * f3;
588 int32_t f5_2 = 2 * f5;
589 int32_t f7_2 = 2 * f7;
590 int32_t f9_2 = 2 * f9;
591 int64_t f0g0 = f0 * (int64_t) g0;
592 int64_t f0g1 = f0 * (int64_t) g1;
593 int64_t f0g2 = f0 * (int64_t) g2;
594 int64_t f0g3 = f0 * (int64_t) g3;
595 int64_t f0g4 = f0 * (int64_t) g4;
596 int64_t f0g5 = f0 * (int64_t) g5;
597 int64_t f0g6 = f0 * (int64_t) g6;
598 int64_t f0g7 = f0 * (int64_t) g7;
599 int64_t f0g8 = f0 * (int64_t) g8;
600 int64_t f0g9 = f0 * (int64_t) g9;
601 int64_t f1g0 = f1 * (int64_t) g0;
602 int64_t f1g1_2 = f1_2 * (int64_t) g1;
603 int64_t f1g2 = f1 * (int64_t) g2;
604 int64_t f1g3_2 = f1_2 * (int64_t) g3;
605 int64_t f1g4 = f1 * (int64_t) g4;
606 int64_t f1g5_2 = f1_2 * (int64_t) g5;
607 int64_t f1g6 = f1 * (int64_t) g6;
608 int64_t f1g7_2 = f1_2 * (int64_t) g7;
609 int64_t f1g8 = f1 * (int64_t) g8;
610 int64_t f1g9_38 = f1_2 * (int64_t) g9_19;
611 int64_t f2g0 = f2 * (int64_t) g0;
612 int64_t f2g1 = f2 * (int64_t) g1;
613 int64_t f2g2 = f2 * (int64_t) g2;
614 int64_t f2g3 = f2 * (int64_t) g3;
615 int64_t f2g4 = f2 * (int64_t) g4;
616 int64_t f2g5 = f2 * (int64_t) g5;
617 int64_t f2g6 = f2 * (int64_t) g6;
618 int64_t f2g7 = f2 * (int64_t) g7;
619 int64_t f2g8_19 = f2 * (int64_t) g8_19;
620 int64_t f2g9_19 = f2 * (int64_t) g9_19;
621 int64_t f3g0 = f3 * (int64_t) g0;
622 int64_t f3g1_2 = f3_2 * (int64_t) g1;
623 int64_t f3g2 = f3 * (int64_t) g2;
624 int64_t f3g3_2 = f3_2 * (int64_t) g3;
625 int64_t f3g4 = f3 * (int64_t) g4;
626 int64_t f3g5_2 = f3_2 * (int64_t) g5;
627 int64_t f3g6 = f3 * (int64_t) g6;
628 int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
629 int64_t f3g8_19 = f3 * (int64_t) g8_19;
630 int64_t f3g9_38 = f3_2 * (int64_t) g9_19;
631 int64_t f4g0 = f4 * (int64_t) g0;
632 int64_t f4g1 = f4 * (int64_t) g1;
633 int64_t f4g2 = f4 * (int64_t) g2;
634 int64_t f4g3 = f4 * (int64_t) g3;
635 int64_t f4g4 = f4 * (int64_t) g4;
636 int64_t f4g5 = f4 * (int64_t) g5;
637 int64_t f4g6_19 = f4 * (int64_t) g6_19;
638 int64_t f4g7_19 = f4 * (int64_t) g7_19;
639 int64_t f4g8_19 = f4 * (int64_t) g8_19;
640 int64_t f4g9_19 = f4 * (int64_t) g9_19;
641 int64_t f5g0 = f5 * (int64_t) g0;
642 int64_t f5g1_2 = f5_2 * (int64_t) g1;
643 int64_t f5g2 = f5 * (int64_t) g2;
644 int64_t f5g3_2 = f5_2 * (int64_t) g3;
645 int64_t f5g4 = f5 * (int64_t) g4;
646 int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
647 int64_t f5g6_19 = f5 * (int64_t) g6_19;
648 int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
649 int64_t f5g8_19 = f5 * (int64_t) g8_19;
650 int64_t f5g9_38 = f5_2 * (int64_t) g9_19;
651 int64_t f6g0 = f6 * (int64_t) g0;
652 int64_t f6g1 = f6 * (int64_t) g1;
653 int64_t f6g2 = f6 * (int64_t) g2;
654 int64_t f6g3 = f6 * (int64_t) g3;
655 int64_t f6g4_19 = f6 * (int64_t) g4_19;
656 int64_t f6g5_19 = f6 * (int64_t) g5_19;
657 int64_t f6g6_19 = f6 * (int64_t) g6_19;
658 int64_t f6g7_19 = f6 * (int64_t) g7_19;
659 int64_t f6g8_19 = f6 * (int64_t) g8_19;
660 int64_t f6g9_19 = f6 * (int64_t) g9_19;
661 int64_t f7g0 = f7 * (int64_t) g0;
662 int64_t f7g1_2 = f7_2 * (int64_t) g1;
663 int64_t f7g2 = f7 * (int64_t) g2;
664 int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
665 int64_t f7g4_19 = f7 * (int64_t) g4_19;
666 int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
667 int64_t f7g6_19 = f7 * (int64_t) g6_19;
668 int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
669 int64_t f7g8_19 = f7 * (int64_t) g8_19;
670 int64_t f7g9_38 = f7_2 * (int64_t) g9_19;
671 int64_t f8g0 = f8 * (int64_t) g0;
672 int64_t f8g1 = f8 * (int64_t) g1;
673 int64_t f8g2_19 = f8 * (int64_t) g2_19;
674 int64_t f8g3_19 = f8 * (int64_t) g3_19;
675 int64_t f8g4_19 = f8 * (int64_t) g4_19;
676 int64_t f8g5_19 = f8 * (int64_t) g5_19;
677 int64_t f8g6_19 = f8 * (int64_t) g6_19;
678 int64_t f8g7_19 = f8 * (int64_t) g7_19;
679 int64_t f8g8_19 = f8 * (int64_t) g8_19;
680 int64_t f8g9_19 = f8 * (int64_t) g9_19;
681 int64_t f9g0 = f9 * (int64_t) g0;
682 int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
683 int64_t f9g2_19 = f9 * (int64_t) g2_19;
684 int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
685 int64_t f9g4_19 = f9 * (int64_t) g4_19;
686 int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
687 int64_t f9g6_19 = f9 * (int64_t) g6_19;
688 int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
689 int64_t f9g8_19 = f9 * (int64_t) g8_19;
690 int64_t f9g9_38 = f9_2 * (int64_t) g9_19;
691 int64_t h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
692 int64_t h1 = f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
693 int64_t h2 = f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
694 int64_t h3 = f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
695 int64_t h4 = f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
696 int64_t h5 = f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
697 int64_t h6 = f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38;
698 int64_t h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19;
699 int64_t h8 = f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38;
700 int64_t h9 = f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0 ;
712 carry0 = (h0 + (1L << 25)) >> 26;
714 h0 -= shl64(carry0, 26);
715 carry4 = (h4 + (1L << 25)) >> 26;
717 h4 -= shl64(carry4, 26);
719 carry1 = (h1 + (1L << 24)) >> 25;
721 h1 -= shl64(carry1, 25);
722 carry5 = (h5 + (1L << 24)) >> 25;
724 h5 -= shl64(carry5, 25);
726 carry2 = (h2 + (1L << 25)) >> 26;
728 h2 -= shl64(carry2, 26);
729 carry6 = (h6 + (1L << 25)) >> 26;
731 h6 -= shl64(carry6, 26);
733 carry3 = (h3 + (1L << 24)) >> 25;
735 h3 -= shl64(carry3, 25);
736 carry7 = (h7 + (1L << 24)) >> 25;
738 h7 -= shl64(carry7, 25);
740 carry4 = (h4 + (1L << 25)) >> 26;
742 h4 -= shl64(carry4, 26);
743 carry8 = (h8 + (1L << 25)) >> 26;
745 h8 -= shl64(carry8, 26);
747 carry9 = (h9 + (1L << 24)) >> 25;
749 h9 -= shl64(carry9, 25);
751 carry0 = (h0 + (1L << 25)) >> 26;
753 h0 -= shl64(carry0, 26);
770 Can overlap h with f.
773 |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
776 |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
779 void fe_mul121666(fe h, fe f) {
790 int64_t h0 = f0 * (int64_t) 121666;
791 int64_t h1 = f1 * (int64_t) 121666;
792 int64_t h2 = f2 * (int64_t) 121666;
793 int64_t h3 = f3 * (int64_t) 121666;
794 int64_t h4 = f4 * (int64_t) 121666;
795 int64_t h5 = f5 * (int64_t) 121666;
796 int64_t h6 = f6 * (int64_t) 121666;
797 int64_t h7 = f7 * (int64_t) 121666;
798 int64_t h8 = f8 * (int64_t) 121666;
799 int64_t h9 = f9 * (int64_t) 121666;
811 carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= shl64(carry9, 25);
812 carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= shl64(carry1, 25);
813 carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= shl64(carry3, 25);
814 carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= shl64(carry5, 25);
815 carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= shl64(carry7, 25);
817 carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= shl64(carry0, 26);
818 carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= shl64(carry2, 26);
819 carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= shl64(carry4, 26);
820 carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= shl64(carry6, 26);
821 carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= shl64(carry8, 26);
840 |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
843 |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
846 void fe_neg(fe h, const fe f) {
881 void fe_pow22523(fe out, const fe z) {
888 for (i = 1; i < 1; ++i) {
894 for (i = 1; i < 2; ++i) {
902 for (i = 1; i < 1; ++i) {
909 for (i = 1; i < 5; ++i) {
916 for (i = 1; i < 10; ++i) {
923 for (i = 1; i < 20; ++i) {
930 for (i = 1; i < 10; ++i) {
937 for (i = 1; i < 50; ++i) {
944 for (i = 1; i < 100; ++i) {
951 for (i = 1; i < 50; ++i) {
958 for (i = 1; i < 2; ++i) {
969 Can overlap h with f.
972 |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
975 |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
979 See fe_mul.c for discussion of implementation strategy.
982 void fe_sq(fe h, const fe f) {
993 int32_t f0_2 = 2 * f0;
994 int32_t f1_2 = 2 * f1;
995 int32_t f2_2 = 2 * f2;
996 int32_t f3_2 = 2 * f3;
997 int32_t f4_2 = 2 * f4;
998 int32_t f5_2 = 2 * f5;
999 int32_t f6_2 = 2 * f6;
1000 int32_t f7_2 = 2 * f7;
1001 int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
1002 int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
1003 int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
1004 int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
1005 int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
1006 int64_t f0f0 = f0 * (int64_t) f0;
1007 int64_t f0f1_2 = f0_2 * (int64_t) f1;
1008 int64_t f0f2_2 = f0_2 * (int64_t) f2;
1009 int64_t f0f3_2 = f0_2 * (int64_t) f3;
1010 int64_t f0f4_2 = f0_2 * (int64_t) f4;
1011 int64_t f0f5_2 = f0_2 * (int64_t) f5;
1012 int64_t f0f6_2 = f0_2 * (int64_t) f6;
1013 int64_t f0f7_2 = f0_2 * (int64_t) f7;
1014 int64_t f0f8_2 = f0_2 * (int64_t) f8;
1015 int64_t f0f9_2 = f0_2 * (int64_t) f9;
1016 int64_t f1f1_2 = f1_2 * (int64_t) f1;
1017 int64_t f1f2_2 = f1_2 * (int64_t) f2;
1018 int64_t f1f3_4 = f1_2 * (int64_t) f3_2;
1019 int64_t f1f4_2 = f1_2 * (int64_t) f4;
1020 int64_t f1f5_4 = f1_2 * (int64_t) f5_2;
1021 int64_t f1f6_2 = f1_2 * (int64_t) f6;
1022 int64_t f1f7_4 = f1_2 * (int64_t) f7_2;
1023 int64_t f1f8_2 = f1_2 * (int64_t) f8;
1024 int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
1025 int64_t f2f2 = f2 * (int64_t) f2;
1026 int64_t f2f3_2 = f2_2 * (int64_t) f3;
1027 int64_t f2f4_2 = f2_2 * (int64_t) f4;
1028 int64_t f2f5_2 = f2_2 * (int64_t) f5;
1029 int64_t f2f6_2 = f2_2 * (int64_t) f6;
1030 int64_t f2f7_2 = f2_2 * (int64_t) f7;
1031 int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
1032 int64_t f2f9_38 = f2 * (int64_t) f9_38;
1033 int64_t f3f3_2 = f3_2 * (int64_t) f3;
1034 int64_t f3f4_2 = f3_2 * (int64_t) f4;
1035 int64_t f3f5_4 = f3_2 * (int64_t) f5_2;
1036 int64_t f3f6_2 = f3_2 * (int64_t) f6;
1037 int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
1038 int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
1039 int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
1040 int64_t f4f4 = f4 * (int64_t) f4;
1041 int64_t f4f5_2 = f4_2 * (int64_t) f5;
1042 int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
1043 int64_t f4f7_38 = f4 * (int64_t) f7_38;
1044 int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
1045 int64_t f4f9_38 = f4 * (int64_t) f9_38;
1046 int64_t f5f5_38 = f5 * (int64_t) f5_38;
1047 int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
1048 int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
1049 int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
1050 int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
1051 int64_t f6f6_19 = f6 * (int64_t) f6_19;
1052 int64_t f6f7_38 = f6 * (int64_t) f7_38;
1053 int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
1054 int64_t f6f9_38 = f6 * (int64_t) f9_38;
1055 int64_t f7f7_38 = f7 * (int64_t) f7_38;
1056 int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
1057 int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
1058 int64_t f8f8_19 = f8 * (int64_t) f8_19;
1059 int64_t f8f9_38 = f8 * (int64_t) f9_38;
1060 int64_t f9f9_38 = f9 * (int64_t) f9_38;
1061 int64_t h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
1062 int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
1063 int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
1064 int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
1065 int64_t h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38;
1066 int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
1067 int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
1068 int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
1069 int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
1070 int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;
1081 carry0 = (h0 + (1L << 25)) >> 26;
1083 h0 -= shl64(carry0, 26);
1084 carry4 = (h4 + (1L << 25)) >> 26;
1086 h4 -= shl64(carry4, 26);
1087 carry1 = (h1 + (1L << 24)) >> 25;
1089 h1 -= shl64(carry1, 25);
1090 carry5 = (h5 + (1L << 24)) >> 25;
1092 h5 -= shl64(carry5, 25);
1093 carry2 = (h2 + (1L << 25)) >> 26;
1095 h2 -= shl64(carry2, 26);
1096 carry6 = (h6 + (1L << 25)) >> 26;
1098 h6 -= shl64(carry6, 26);
1099 carry3 = (h3 + (1L << 24)) >> 25;
1101 h3 -= shl64(carry3, 25);
1102 carry7 = (h7 + (1L << 24)) >> 25;
1104 h7 -= shl64(carry7, 25);
1105 carry4 = (h4 + (1L << 25)) >> 26;
1107 h4 -= shl64(carry4, 26);
1108 carry8 = (h8 + (1L << 25)) >> 26;
1110 h8 -= shl64(carry8, 26);
1111 carry9 = (h9 + (1L << 24)) >> 25;
1113 h9 -= shl64(carry9, 25);
1114 carry0 = (h0 + (1L << 25)) >> 26;
1116 h0 -= shl64(carry0, 26);
1117 h[0] = (int32_t) h0;
1118 h[1] = (int32_t) h1;
1119 h[2] = (int32_t) h2;
1120 h[3] = (int32_t) h3;
1121 h[4] = (int32_t) h4;
1122 h[5] = (int32_t) h5;
1123 h[6] = (int32_t) h6;
1124 h[7] = (int32_t) h7;
1125 h[8] = (int32_t) h8;
1126 h[9] = (int32_t) h9;
1132 Can overlap h with f.
1135 |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
1138 |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
1142 See fe_mul.c for discussion of implementation strategy.
1145 void fe_sq2(fe h, const fe f) {
1156 int32_t f0_2 = 2 * f0;
1157 int32_t f1_2 = 2 * f1;
1158 int32_t f2_2 = 2 * f2;
1159 int32_t f3_2 = 2 * f3;
1160 int32_t f4_2 = 2 * f4;
1161 int32_t f5_2 = 2 * f5;
1162 int32_t f6_2 = 2 * f6;
1163 int32_t f7_2 = 2 * f7;
1164 int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
1165 int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
1166 int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
1167 int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
1168 int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
1169 int64_t f0f0 = f0 * (int64_t) f0;
1170 int64_t f0f1_2 = f0_2 * (int64_t) f1;
1171 int64_t f0f2_2 = f0_2 * (int64_t) f2;
1172 int64_t f0f3_2 = f0_2 * (int64_t) f3;
1173 int64_t f0f4_2 = f0_2 * (int64_t) f4;
1174 int64_t f0f5_2 = f0_2 * (int64_t) f5;
1175 int64_t f0f6_2 = f0_2 * (int64_t) f6;
1176 int64_t f0f7_2 = f0_2 * (int64_t) f7;
1177 int64_t f0f8_2 = f0_2 * (int64_t) f8;
1178 int64_t f0f9_2 = f0_2 * (int64_t) f9;
1179 int64_t f1f1_2 = f1_2 * (int64_t) f1;
1180 int64_t f1f2_2 = f1_2 * (int64_t) f2;
1181 int64_t f1f3_4 = f1_2 * (int64_t) f3_2;
1182 int64_t f1f4_2 = f1_2 * (int64_t) f4;
1183 int64_t f1f5_4 = f1_2 * (int64_t) f5_2;
1184 int64_t f1f6_2 = f1_2 * (int64_t) f6;
1185 int64_t f1f7_4 = f1_2 * (int64_t) f7_2;
1186 int64_t f1f8_2 = f1_2 * (int64_t) f8;
1187 int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
1188 int64_t f2f2 = f2 * (int64_t) f2;
1189 int64_t f2f3_2 = f2_2 * (int64_t) f3;
1190 int64_t f2f4_2 = f2_2 * (int64_t) f4;
1191 int64_t f2f5_2 = f2_2 * (int64_t) f5;
1192 int64_t f2f6_2 = f2_2 * (int64_t) f6;
1193 int64_t f2f7_2 = f2_2 * (int64_t) f7;
1194 int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
1195 int64_t f2f9_38 = f2 * (int64_t) f9_38;
1196 int64_t f3f3_2 = f3_2 * (int64_t) f3;
1197 int64_t f3f4_2 = f3_2 * (int64_t) f4;
1198 int64_t f3f5_4 = f3_2 * (int64_t) f5_2;
1199 int64_t f3f6_2 = f3_2 * (int64_t) f6;
1200 int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
1201 int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
1202 int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
1203 int64_t f4f4 = f4 * (int64_t) f4;
1204 int64_t f4f5_2 = f4_2 * (int64_t) f5;
1205 int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
1206 int64_t f4f7_38 = f4 * (int64_t) f7_38;
1207 int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
1208 int64_t f4f9_38 = f4 * (int64_t) f9_38;
1209 int64_t f5f5_38 = f5 * (int64_t) f5_38;
1210 int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
1211 int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
1212 int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
1213 int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
1214 int64_t f6f6_19 = f6 * (int64_t) f6_19;
1215 int64_t f6f7_38 = f6 * (int64_t) f7_38;
1216 int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
1217 int64_t f6f9_38 = f6 * (int64_t) f9_38;
1218 int64_t f7f7_38 = f7 * (int64_t) f7_38;
1219 int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
1220 int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
1221 int64_t f8f8_19 = f8 * (int64_t) f8_19;
1222 int64_t f8f9_38 = f8 * (int64_t) f9_38;
1223 int64_t f9f9_38 = f9 * (int64_t) f9_38;
1224 int64_t h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
1225 int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
1226 int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
1227 int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
1228 int64_t h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38;
1229 int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
1230 int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
1231 int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
1232 int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
1233 int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;
1254 carry0 = (h0 + (1L << 25)) >> 26;
1256 h0 -= shl64(carry0, 26);
1257 carry4 = (h4 + (1L << 25)) >> 26;
1259 h4 -= shl64(carry4, 26);
1260 carry1 = (h1 + (1L << 24)) >> 25;
1262 h1 -= shl64(carry1, 25);
1263 carry5 = (h5 + (1L << 24)) >> 25;
1265 h5 -= shl64(carry5, 25);
1266 carry2 = (h2 + (1L << 25)) >> 26;
1268 h2 -= shl64(carry2, 26);
1269 carry6 = (h6 + (1L << 25)) >> 26;
1271 h6 -= shl64(carry6, 26);
1272 carry3 = (h3 + (1L << 24)) >> 25;
1274 h3 -= shl64(carry3, 25);
1275 carry7 = (h7 + (1L << 24)) >> 25;
1277 h7 -= shl64(carry7, 25);
1278 carry4 = (h4 + (1L << 25)) >> 26;
1280 h4 -= shl64(carry4, 26);
1281 carry8 = (h8 + (1L << 25)) >> 26;
1283 h8 -= shl64(carry8, 26);
1284 carry9 = (h9 + (1L << 24)) >> 25;
1286 h9 -= shl64(carry9, 25);
1287 carry0 = (h0 + (1L << 25)) >> 26;
1289 h0 -= shl64(carry0, 26);
1290 h[0] = (int32_t) h0;
1291 h[1] = (int32_t) h1;
1292 h[2] = (int32_t) h2;
1293 h[3] = (int32_t) h3;
1294 h[4] = (int32_t) h4;
1295 h[5] = (int32_t) h5;
1296 h[6] = (int32_t) h6;
1297 h[7] = (int32_t) h7;
1298 h[8] = (int32_t) h8;
1299 h[9] = (int32_t) h9;
1305 Can overlap h with f or g.
1308 |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
1309 |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
1312 |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
1315 void fe_sub(fe h, const fe f, const fe g) {
1336 int32_t h0 = f0 - g0;
1337 int32_t h1 = f1 - g1;
1338 int32_t h2 = f2 - g2;
1339 int32_t h3 = f3 - g3;
1340 int32_t h4 = f4 - g4;
1341 int32_t h5 = f5 - g5;
1342 int32_t h6 = f6 - g6;
1343 int32_t h7 = f7 - g7;
1344 int32_t h8 = f8 - g8;
1345 int32_t h9 = f9 - g9;
1363 |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
1365 Write p=2^255-19; q=floor(h/p).
1366 Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
1369 Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
1370 Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
1372 Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
1376 Have 0<=r<=p-1=2^255-20.
1377 Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
1379 Write x=r+19(2^-255)r+y.
1380 Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
1382 Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
1383 so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
1386 void fe_tobytes(unsigned char *s, const fe h) {
1408 q = (19 * h9 + (((int32_t) 1) << 24)) >> 25;
1419 /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
1421 /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
1424 h0 -= shl32(carry0, 26);
1427 h1 -= shl32(carry1, 25);
1430 h2 -= shl32(carry2, 26);
1433 h3 -= shl32(carry3, 25);
1436 h4 -= shl32(carry4, 26);
1439 h5 -= shl32(carry5, 25);
1442 h6 -= shl32(carry6, 26);
1445 h7 -= shl32(carry7, 25);
1448 h8 -= shl32(carry8, 26);
1450 h9 -= shl32(carry9, 25);
1454 Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
1455 Have h0+...+2^230 h9 between 0 and 2^255-1;
1456 evidently 2^255 h10-2^255 q = 0.
1457 Goal: Output h0+...+2^230 h9.
1459 s[0] = (unsigned char) (h0 >> 0);
1460 s[1] = (unsigned char) (h0 >> 8);
1461 s[2] = (unsigned char) (h0 >> 16);
1462 s[3] = (unsigned char) ((h0 >> 24) | shl32(h1, 2));
1463 s[4] = (unsigned char) (h1 >> 6);
1464 s[5] = (unsigned char) (h1 >> 14);
1465 s[6] = (unsigned char) ((h1 >> 22) | shl32(h2, 3));
1466 s[7] = (unsigned char) (h2 >> 5);
1467 s[8] = (unsigned char) (h2 >> 13);
1468 s[9] = (unsigned char) ((h2 >> 21) | shl32(h3, 5));
1469 s[10] = (unsigned char) (h3 >> 3);
1470 s[11] = (unsigned char) (h3 >> 11);
1471 s[12] = (unsigned char) ((h3 >> 19) | shl32(h4, 6));
1472 s[13] = (unsigned char) (h4 >> 2);
1473 s[14] = (unsigned char) (h4 >> 10);
1474 s[15] = (unsigned char) (h4 >> 18);
1475 s[16] = (unsigned char) (h5 >> 0);
1476 s[17] = (unsigned char) (h5 >> 8);
1477 s[18] = (unsigned char) (h5 >> 16);
1478 s[19] = (unsigned char) ((h5 >> 24) | shl32(h6, 1));
1479 s[20] = (unsigned char) (h6 >> 7);
1480 s[21] = (unsigned char) (h6 >> 15);
1481 s[22] = (unsigned char) ((h6 >> 23) | shl32(h7, 3));
1482 s[23] = (unsigned char) (h7 >> 5);
1483 s[24] = (unsigned char) (h7 >> 13);
1484 s[25] = (unsigned char) ((h7 >> 21) | shl32(h8, 4));
1485 s[26] = (unsigned char) (h8 >> 4);
1486 s[27] = (unsigned char) (h8 >> 12);
1487 s[28] = (unsigned char) ((h8 >> 20) | shl32(h9, 6));
1488 s[29] = (unsigned char) (h9 >> 2);
1489 s[30] = (unsigned char) (h9 >> 10);
1490 s[31] = (unsigned char) (h9 >> 18);