www.tinc-vpn.org Git - tinc/blob - src/ed25519/fe.c

   1 #include "fixedint.h"
   2 #include "fe.h"
   3
   4
   5 /*
   6     helper functions
   7 */
   8 static uint64_t load_3(const unsigned char *in) {
   9     uint64_t result;
  10
  11     result = in[0];
  12     result |= shlu64(in[1], 8);
  13     result |= shlu64(in[2], 16);
  14
  15     return result;
  16 }
  17
  18 static uint64_t load_4(const unsigned char *in) {
  19     uint64_t result;
  20
  21     result = in[0];
  22     result |= shlu64(in[1], 8);
  23     result |= shlu64(in[2], 16);
  24     result |= shlu64(in[3], 24);
  25
  26     return result;
  27 }
  28
  29
  30
  31 /*
  32     h = 0
  33 */
  34
  35 void fe_0(fe h) {
  36     h[0] = 0;
  37     h[1] = 0;
  38     h[2] = 0;
  39     h[3] = 0;
  40     h[4] = 0;
  41     h[5] = 0;
  42     h[6] = 0;
  43     h[7] = 0;
  44     h[8] = 0;
  45     h[9] = 0;
  46 }
  47
  48
  49
  50 /*
  51     h = 1
  52 */
  53
  54 void fe_1(fe h) {
  55     h[0] = 1;
  56     h[1] = 0;
  57     h[2] = 0;
  58     h[3] = 0;
  59     h[4] = 0;
  60     h[5] = 0;
  61     h[6] = 0;
  62     h[7] = 0;
  63     h[8] = 0;
  64     h[9] = 0;
  65 }
  66
  67
  68
  69 /*
  70     h = f + g
  71     Can overlap h with f or g.
  72
  73     Preconditions:
  74        |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
  75        |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
  76
  77     Postconditions:
  78        |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
  79 */
  80
  81 void fe_add(fe h, const fe f, const fe g) {
  82     int32_t f0 = f[0];
  83     int32_t f1 = f[1];
  84     int32_t f2 = f[2];
  85     int32_t f3 = f[3];
  86     int32_t f4 = f[4];
  87     int32_t f5 = f[5];
  88     int32_t f6 = f[6];
  89     int32_t f7 = f[7];
  90     int32_t f8 = f[8];
  91     int32_t f9 = f[9];
  92     int32_t g0 = g[0];
  93     int32_t g1 = g[1];
  94     int32_t g2 = g[2];
  95     int32_t g3 = g[3];
  96     int32_t g4 = g[4];
  97     int32_t g5 = g[5];
  98     int32_t g6 = g[6];
  99     int32_t g7 = g[7];
 100     int32_t g8 = g[8];
 101     int32_t g9 = g[9];
 102     int32_t h0 = f0 + g0;
 103     int32_t h1 = f1 + g1;
 104     int32_t h2 = f2 + g2;
 105     int32_t h3 = f3 + g3;
 106     int32_t h4 = f4 + g4;
 107     int32_t h5 = f5 + g5;
 108     int32_t h6 = f6 + g6;
 109     int32_t h7 = f7 + g7;
 110     int32_t h8 = f8 + g8;
 111     int32_t h9 = f9 + g9;
 112
 113     h[0] = h0;
 114     h[1] = h1;
 115     h[2] = h2;
 116     h[3] = h3;
 117     h[4] = h4;
 118     h[5] = h5;
 119     h[6] = h6;
 120     h[7] = h7;
 121     h[8] = h8;
 122     h[9] = h9;
 123 }
 124
 125
 126
 127 /*
 128     Replace (f,g) with (g,g) if b == 1;
 129     replace (f,g) with (f,g) if b == 0.
 130
 131     Preconditions: b in {0,1}.
 132 */
 133
 134 void fe_cmov(fe f, const fe g, unsigned int b) {
 135     int32_t f0 = f[0];
 136     int32_t f1 = f[1];
 137     int32_t f2 = f[2];
 138     int32_t f3 = f[3];
 139     int32_t f4 = f[4];
 140     int32_t f5 = f[5];
 141     int32_t f6 = f[6];
 142     int32_t f7 = f[7];
 143     int32_t f8 = f[8];
 144     int32_t f9 = f[9];
 145     int32_t g0 = g[0];
 146     int32_t g1 = g[1];
 147     int32_t g2 = g[2];
 148     int32_t g3 = g[3];
 149     int32_t g4 = g[4];
 150     int32_t g5 = g[5];
 151     int32_t g6 = g[6];
 152     int32_t g7 = g[7];
 153     int32_t g8 = g[8];
 154     int32_t g9 = g[9];
 155     int32_t x0 = f0 ^ g0;
 156     int32_t x1 = f1 ^ g1;
 157     int32_t x2 = f2 ^ g2;
 158     int32_t x3 = f3 ^ g3;
 159     int32_t x4 = f4 ^ g4;
 160     int32_t x5 = f5 ^ g5;
 161     int32_t x6 = f6 ^ g6;
 162     int32_t x7 = f7 ^ g7;
 163     int32_t x8 = f8 ^ g8;
 164     int32_t x9 = f9 ^ g9;
 165
 166     b = (unsigned int) (- (int) b); /* silence warning */
 167     x0 &= b;
 168     x1 &= b;
 169     x2 &= b;
 170     x3 &= b;
 171     x4 &= b;
 172     x5 &= b;
 173     x6 &= b;
 174     x7 &= b;
 175     x8 &= b;
 176     x9 &= b;
 177
 178     f[0] = f0 ^ x0;
 179     f[1] = f1 ^ x1;
 180     f[2] = f2 ^ x2;
 181     f[3] = f3 ^ x3;
 182     f[4] = f4 ^ x4;
 183     f[5] = f5 ^ x5;
 184     f[6] = f6 ^ x6;
 185     f[7] = f7 ^ x7;
 186     f[8] = f8 ^ x8;
 187     f[9] = f9 ^ x9;
 188 }
 189
 190 /*
 191     Replace (f,g) with (g,f) if b == 1;
 192     replace (f,g) with (f,g) if b == 0.
 193
 194     Preconditions: b in {0,1}.
 195 */
 196
 197 void fe_cswap(fe f,fe g,unsigned int b) {
 198     int32_t f0 = f[0];
 199     int32_t f1 = f[1];
 200     int32_t f2 = f[2];
 201     int32_t f3 = f[3];
 202     int32_t f4 = f[4];
 203     int32_t f5 = f[5];
 204     int32_t f6 = f[6];
 205     int32_t f7 = f[7];
 206     int32_t f8 = f[8];
 207     int32_t f9 = f[9];
 208     int32_t g0 = g[0];
 209     int32_t g1 = g[1];
 210     int32_t g2 = g[2];
 211     int32_t g3 = g[3];
 212     int32_t g4 = g[4];
 213     int32_t g5 = g[5];
 214     int32_t g6 = g[6];
 215     int32_t g7 = g[7];
 216     int32_t g8 = g[8];
 217     int32_t g9 = g[9];
 218     int32_t x0 = f0 ^ g0;
 219     int32_t x1 = f1 ^ g1;
 220     int32_t x2 = f2 ^ g2;
 221     int32_t x3 = f3 ^ g3;
 222     int32_t x4 = f4 ^ g4;
 223     int32_t x5 = f5 ^ g5;
 224     int32_t x6 = f6 ^ g6;
 225     int32_t x7 = f7 ^ g7;
 226     int32_t x8 = f8 ^ g8;
 227     int32_t x9 = f9 ^ g9;
 228     b = -b;
 229     x0 &= b;
 230     x1 &= b;
 231     x2 &= b;
 232     x3 &= b;
 233     x4 &= b;
 234     x5 &= b;
 235     x6 &= b;
 236     x7 &= b;
 237     x8 &= b;
 238     x9 &= b;
 239     f[0] = f0 ^ x0;
 240     f[1] = f1 ^ x1;
 241     f[2] = f2 ^ x2;
 242     f[3] = f3 ^ x3;
 243     f[4] = f4 ^ x4;
 244     f[5] = f5 ^ x5;
 245     f[6] = f6 ^ x6;
 246     f[7] = f7 ^ x7;
 247     f[8] = f8 ^ x8;
 248     f[9] = f9 ^ x9;
 249     g[0] = g0 ^ x0;
 250     g[1] = g1 ^ x1;
 251     g[2] = g2 ^ x2;
 252     g[3] = g3 ^ x3;
 253     g[4] = g4 ^ x4;
 254     g[5] = g5 ^ x5;
 255     g[6] = g6 ^ x6;
 256     g[7] = g7 ^ x7;
 257     g[8] = g8 ^ x8;
 258     g[9] = g9 ^ x9;
 259 }
 260
 261
 262
 263 /*
 264     h = f
 265 */
 266
 267 void fe_copy(fe h, const fe f) {
 268     int32_t f0 = f[0];
 269     int32_t f1 = f[1];
 270     int32_t f2 = f[2];
 271     int32_t f3 = f[3];
 272     int32_t f4 = f[4];
 273     int32_t f5 = f[5];
 274     int32_t f6 = f[6];
 275     int32_t f7 = f[7];
 276     int32_t f8 = f[8];
 277     int32_t f9 = f[9];
 278
 279     h[0] = f0;
 280     h[1] = f1;
 281     h[2] = f2;
 282     h[3] = f3;
 283     h[4] = f4;
 284     h[5] = f5;
 285     h[6] = f6;
 286     h[7] = f7;
 287     h[8] = f8;
 288     h[9] = f9;
 289 }
 290
 291
 292
 293 /*
 294     Ignores top bit of h.
 295 */
 296
 297 void fe_frombytes(fe h, const unsigned char *s) {
 298     int64_t h0 = load_4(s);
 299     int64_t h1 = load_3(s + 4) << 6;
 300     int64_t h2 = load_3(s + 7) << 5;
 301     int64_t h3 = load_3(s + 10) << 3;
 302     int64_t h4 = load_3(s + 13) << 2;
 303     int64_t h5 = load_4(s + 16);
 304     int64_t h6 = load_3(s + 20) << 7;
 305     int64_t h7 = load_3(s + 23) << 5;
 306     int64_t h8 = load_3(s + 26) << 4;
 307     int64_t h9 = (load_3(s + 29) & 8388607) << 2;
 308     int64_t carry0;
 309     int64_t carry1;
 310     int64_t carry2;
 311     int64_t carry3;
 312     int64_t carry4;
 313     int64_t carry5;
 314     int64_t carry6;
 315     int64_t carry7;
 316     int64_t carry8;
 317     int64_t carry9;
 318
 319     carry9 = (h9 + (1L << 24)) >> 25;
 320     h0 += carry9 * 19;
 321     h9 -= shl64(carry9, 25);
 322     carry1 = (h1 + (1L << 24)) >> 25;
 323     h2 += carry1;
 324     h1 -= shl64(carry1, 25);
 325     carry3 = (h3 + (1L << 24)) >> 25;
 326     h4 += carry3;
 327     h3 -= shl64(carry3, 25);
 328     carry5 = (h5 + (1L << 24)) >> 25;
 329     h6 += carry5;
 330     h5 -= shl64(carry5, 25);
 331     carry7 = (h7 + (1L << 24)) >> 25;
 332     h8 += carry7;
 333     h7 -= shl64(carry7, 25);
 334     carry0 = (h0 + (1L << 25)) >> 26;
 335     h1 += carry0;
 336     h0 -= shl64(carry0, 26);
 337     carry2 = (h2 + (1L << 25)) >> 26;
 338     h3 += carry2;
 339     h2 -= shl64(carry2, 26);
 340     carry4 = (h4 + (1L << 25)) >> 26;
 341     h5 += carry4;
 342     h4 -= shl64(carry4, 26);
 343     carry6 = (h6 + (1L << 25)) >> 26;
 344     h7 += carry6;
 345     h6 -= shl64(carry6, 26);
 346     carry8 = (h8 + (1L << 25)) >> 26;
 347     h9 += carry8;
 348     h8 -= shl64(carry8, 26);
 349
 350     h[0] = h0;
 351     h[1] = h1;
 352     h[2] = h2;
 353     h[3] = h3;
 354     h[4] = h4;
 355     h[5] = h5;
 356     h[6] = h6;
 357     h[7] = h7;
 358     h[8] = h8;
 359     h[9] = h9;
 360 }
 361
 362
 363
 364 void fe_invert(fe out, const fe z) {
 365     fe t0;
 366     fe t1;
 367     fe t2;
 368     fe t3;
 369     int i;
 370
 371     fe_sq(t0, z);
 372
 373     for (i = 1; i < 1; ++i) {
 374         fe_sq(t0, t0);
 375     }
 376
 377     fe_sq(t1, t0);
 378
 379     for (i = 1; i < 2; ++i) {
 380         fe_sq(t1, t1);
 381     }
 382
 383     fe_mul(t1, z, t1);
 384     fe_mul(t0, t0, t1);
 385     fe_sq(t2, t0);
 386
 387     for (i = 1; i < 1; ++i) {
 388         fe_sq(t2, t2);
 389     }
 390
 391     fe_mul(t1, t1, t2);
 392     fe_sq(t2, t1);
 393
 394     for (i = 1; i < 5; ++i) {
 395         fe_sq(t2, t2);
 396     }
 397
 398     fe_mul(t1, t2, t1);
 399     fe_sq(t2, t1);
 400
 401     for (i = 1; i < 10; ++i) {
 402         fe_sq(t2, t2);
 403     }
 404
 405     fe_mul(t2, t2, t1);
 406     fe_sq(t3, t2);
 407
 408     for (i = 1; i < 20; ++i) {
 409         fe_sq(t3, t3);
 410     }
 411
 412     fe_mul(t2, t3, t2);
 413     fe_sq(t2, t2);
 414
 415     for (i = 1; i < 10; ++i) {
 416         fe_sq(t2, t2);
 417     }
 418
 419     fe_mul(t1, t2, t1);
 420     fe_sq(t2, t1);
 421
 422     for (i = 1; i < 50; ++i) {
 423         fe_sq(t2, t2);
 424     }
 425
 426     fe_mul(t2, t2, t1);
 427     fe_sq(t3, t2);
 428
 429     for (i = 1; i < 100; ++i) {
 430         fe_sq(t3, t3);
 431     }
 432
 433     fe_mul(t2, t3, t2);
 434     fe_sq(t2, t2);
 435
 436     for (i = 1; i < 50; ++i) {
 437         fe_sq(t2, t2);
 438     }
 439
 440     fe_mul(t1, t2, t1);
 441     fe_sq(t1, t1);
 442
 443     for (i = 1; i < 5; ++i) {
 444         fe_sq(t1, t1);
 445     }
 446
 447     fe_mul(out, t1, t0);
 448 }
 449
 450
 451
 452 /*
 453     return 1 if f is in {1,3,5,...,q-2}
 454     return 0 if f is in {0,2,4,...,q-1}
 455
 456     Preconditions:
 457        |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 458 */
 459
 460 int fe_isnegative(const fe f) {
 461     unsigned char s[32];
 462
 463     fe_tobytes(s, f);
 464
 465     return s[0] & 1;
 466 }
 467
 468
 469
 470 /*
 471     return 1 if f == 0
 472     return 0 if f != 0
 473
 474     Preconditions:
 475        |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 476 */
 477
 478 int fe_isnonzero(const fe f) {
 479     unsigned char s[32];
 480     unsigned char r;
 481
 482     fe_tobytes(s, f);
 483
 484     r = s[0];
 485     #define F(i) r |= s[i]
 486     F(1);
 487     F(2);
 488     F(3);
 489     F(4);
 490     F(5);
 491     F(6);
 492     F(7);
 493     F(8);
 494     F(9);
 495     F(10);
 496     F(11);
 497     F(12);
 498     F(13);
 499     F(14);
 500     F(15);
 501     F(16);
 502     F(17);
 503     F(18);
 504     F(19);
 505     F(20);
 506     F(21);
 507     F(22);
 508     F(23);
 509     F(24);
 510     F(25);
 511     F(26);
 512     F(27);
 513     F(28);
 514     F(29);
 515     F(30);
 516     F(31);
 517     #undef F
 518
 519     return r != 0;
 520 }
 521
 522
 523
 524 /*
 525     h = f * g
 526     Can overlap h with f or g.
 527
 528     Preconditions:
 529        |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
 530        |g| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
 531
 532     Postconditions:
 533        |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
 534     */
 535
 536     /*
 537     Notes on implementation strategy:
 538
 539     Using schoolbook multiplication.
 540     Karatsuba would save a little in some cost models.
 541
 542     Most multiplications by 2 and 19 are 32-bit precomputations;
 543     cheaper than 64-bit postcomputations.
 544
 545     There is one remaining multiplication by 19 in the carry chain;
 546     one *19 precomputation can be merged into this,
 547     but the resulting data flow is considerably less clean.
 548
 549     There are 12 carries below.
 550     10 of them are 2-way parallelizable and vectorizable.
 551     Can get away with 11 carries, but then data flow is much deeper.
 552
 553     With tighter constraints on inputs can squeeze carries into int32.
 554 */
 555
 556 void fe_mul(fe h, const fe f, const fe g) {
 557     int32_t f0 = f[0];
 558     int32_t f1 = f[1];
 559     int32_t f2 = f[2];
 560     int32_t f3 = f[3];
 561     int32_t f4 = f[4];
 562     int32_t f5 = f[5];
 563     int32_t f6 = f[6];
 564     int32_t f7 = f[7];
 565     int32_t f8 = f[8];
 566     int32_t f9 = f[9];
 567     int32_t g0 = g[0];
 568     int32_t g1 = g[1];
 569     int32_t g2 = g[2];
 570     int32_t g3 = g[3];
 571     int32_t g4 = g[4];
 572     int32_t g5 = g[5];
 573     int32_t g6 = g[6];
 574     int32_t g7 = g[7];
 575     int32_t g8 = g[8];
 576     int32_t g9 = g[9];
 577     int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */
 578     int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
 579     int32_t g3_19 = 19 * g3;
 580     int32_t g4_19 = 19 * g4;
 581     int32_t g5_19 = 19 * g5;
 582     int32_t g6_19 = 19 * g6;
 583     int32_t g7_19 = 19 * g7;
 584     int32_t g8_19 = 19 * g8;
 585     int32_t g9_19 = 19 * g9;
 586     int32_t f1_2 = 2 * f1;
 587     int32_t f3_2 = 2 * f3;
 588     int32_t f5_2 = 2 * f5;
 589     int32_t f7_2 = 2 * f7;
 590     int32_t f9_2 = 2 * f9;
 591     int64_t f0g0    = f0   * (int64_t) g0;
 592     int64_t f0g1    = f0   * (int64_t) g1;
 593     int64_t f0g2    = f0   * (int64_t) g2;
 594     int64_t f0g3    = f0   * (int64_t) g3;
 595     int64_t f0g4    = f0   * (int64_t) g4;
 596     int64_t f0g5    = f0   * (int64_t) g5;
 597     int64_t f0g6    = f0   * (int64_t) g6;
 598     int64_t f0g7    = f0   * (int64_t) g7;
 599     int64_t f0g8    = f0   * (int64_t) g8;
 600     int64_t f0g9    = f0   * (int64_t) g9;
 601     int64_t f1g0    = f1   * (int64_t) g0;
 602     int64_t f1g1_2  = f1_2 * (int64_t) g1;
 603     int64_t f1g2    = f1   * (int64_t) g2;
 604     int64_t f1g3_2  = f1_2 * (int64_t) g3;
 605     int64_t f1g4    = f1   * (int64_t) g4;
 606     int64_t f1g5_2  = f1_2 * (int64_t) g5;
 607     int64_t f1g6    = f1   * (int64_t) g6;
 608     int64_t f1g7_2  = f1_2 * (int64_t) g7;
 609     int64_t f1g8    = f1   * (int64_t) g8;
 610     int64_t f1g9_38 = f1_2 * (int64_t) g9_19;
 611     int64_t f2g0    = f2   * (int64_t) g0;
 612     int64_t f2g1    = f2   * (int64_t) g1;
 613     int64_t f2g2    = f2   * (int64_t) g2;
 614     int64_t f2g3    = f2   * (int64_t) g3;
 615     int64_t f2g4    = f2   * (int64_t) g4;
 616     int64_t f2g5    = f2   * (int64_t) g5;
 617     int64_t f2g6    = f2   * (int64_t) g6;
 618     int64_t f2g7    = f2   * (int64_t) g7;
 619     int64_t f2g8_19 = f2   * (int64_t) g8_19;
 620     int64_t f2g9_19 = f2   * (int64_t) g9_19;
 621     int64_t f3g0    = f3   * (int64_t) g0;
 622     int64_t f3g1_2  = f3_2 * (int64_t) g1;
 623     int64_t f3g2    = f3   * (int64_t) g2;
 624     int64_t f3g3_2  = f3_2 * (int64_t) g3;
 625     int64_t f3g4    = f3   * (int64_t) g4;
 626     int64_t f3g5_2  = f3_2 * (int64_t) g5;
 627     int64_t f3g6    = f3   * (int64_t) g6;
 628     int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
 629     int64_t f3g8_19 = f3   * (int64_t) g8_19;
 630     int64_t f3g9_38 = f3_2 * (int64_t) g9_19;
 631     int64_t f4g0    = f4   * (int64_t) g0;
 632     int64_t f4g1    = f4   * (int64_t) g1;
 633     int64_t f4g2    = f4   * (int64_t) g2;
 634     int64_t f4g3    = f4   * (int64_t) g3;
 635     int64_t f4g4    = f4   * (int64_t) g4;
 636     int64_t f4g5    = f4   * (int64_t) g5;
 637     int64_t f4g6_19 = f4   * (int64_t) g6_19;
 638     int64_t f4g7_19 = f4   * (int64_t) g7_19;
 639     int64_t f4g8_19 = f4   * (int64_t) g8_19;
 640     int64_t f4g9_19 = f4   * (int64_t) g9_19;
 641     int64_t f5g0    = f5   * (int64_t) g0;
 642     int64_t f5g1_2  = f5_2 * (int64_t) g1;
 643     int64_t f5g2    = f5   * (int64_t) g2;
 644     int64_t f5g3_2  = f5_2 * (int64_t) g3;
 645     int64_t f5g4    = f5   * (int64_t) g4;
 646     int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
 647     int64_t f5g6_19 = f5   * (int64_t) g6_19;
 648     int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
 649     int64_t f5g8_19 = f5   * (int64_t) g8_19;
 650     int64_t f5g9_38 = f5_2 * (int64_t) g9_19;
 651     int64_t f6g0    = f6   * (int64_t) g0;
 652     int64_t f6g1    = f6   * (int64_t) g1;
 653     int64_t f6g2    = f6   * (int64_t) g2;
 654     int64_t f6g3    = f6   * (int64_t) g3;
 655     int64_t f6g4_19 = f6   * (int64_t) g4_19;
 656     int64_t f6g5_19 = f6   * (int64_t) g5_19;
 657     int64_t f6g6_19 = f6   * (int64_t) g6_19;
 658     int64_t f6g7_19 = f6   * (int64_t) g7_19;
 659     int64_t f6g8_19 = f6   * (int64_t) g8_19;
 660     int64_t f6g9_19 = f6   * (int64_t) g9_19;
 661     int64_t f7g0    = f7   * (int64_t) g0;
 662     int64_t f7g1_2  = f7_2 * (int64_t) g1;
 663     int64_t f7g2    = f7   * (int64_t) g2;
 664     int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
 665     int64_t f7g4_19 = f7   * (int64_t) g4_19;
 666     int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
 667     int64_t f7g6_19 = f7   * (int64_t) g6_19;
 668     int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
 669     int64_t f7g8_19 = f7   * (int64_t) g8_19;
 670     int64_t f7g9_38 = f7_2 * (int64_t) g9_19;
 671     int64_t f8g0    = f8   * (int64_t) g0;
 672     int64_t f8g1    = f8   * (int64_t) g1;
 673     int64_t f8g2_19 = f8   * (int64_t) g2_19;
 674     int64_t f8g3_19 = f8   * (int64_t) g3_19;
 675     int64_t f8g4_19 = f8   * (int64_t) g4_19;
 676     int64_t f8g5_19 = f8   * (int64_t) g5_19;
 677     int64_t f8g6_19 = f8   * (int64_t) g6_19;
 678     int64_t f8g7_19 = f8   * (int64_t) g7_19;
 679     int64_t f8g8_19 = f8   * (int64_t) g8_19;
 680     int64_t f8g9_19 = f8   * (int64_t) g9_19;
 681     int64_t f9g0    = f9   * (int64_t) g0;
 682     int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
 683     int64_t f9g2_19 = f9   * (int64_t) g2_19;
 684     int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
 685     int64_t f9g4_19 = f9   * (int64_t) g4_19;
 686     int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
 687     int64_t f9g6_19 = f9   * (int64_t) g6_19;
 688     int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
 689     int64_t f9g8_19 = f9   * (int64_t) g8_19;
 690     int64_t f9g9_38 = f9_2 * (int64_t) g9_19;
 691     int64_t h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
 692     int64_t h1 = f0g1 + f1g0   + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
 693     int64_t h2 = f0g2 + f1g1_2 + f2g0   + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
 694     int64_t h3 = f0g3 + f1g2   + f2g1   + f3g0   + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
 695     int64_t h4 = f0g4 + f1g3_2 + f2g2   + f3g1_2 + f4g0   + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
 696     int64_t h5 = f0g5 + f1g4   + f2g3   + f3g2   + f4g1   + f5g0   + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
 697     int64_t h6 = f0g6 + f1g5_2 + f2g4   + f3g3_2 + f4g2   + f5g1_2 + f6g0   + f7g9_38 + f8g8_19 + f9g7_38;
 698     int64_t h7 = f0g7 + f1g6   + f2g5   + f3g4   + f4g3   + f5g2   + f6g1   + f7g0   + f8g9_19 + f9g8_19;
 699     int64_t h8 = f0g8 + f1g7_2 + f2g6   + f3g5_2 + f4g4   + f5g3_2 + f6g2   + f7g1_2 + f8g0   + f9g9_38;
 700     int64_t h9 = f0g9 + f1g8   + f2g7   + f3g6   + f4g5   + f5g4   + f6g3   + f7g2   + f8g1   + f9g0   ;
 701     int64_t carry0;
 702     int64_t carry1;
 703     int64_t carry2;
 704     int64_t carry3;
 705     int64_t carry4;
 706     int64_t carry5;
 707     int64_t carry6;
 708     int64_t carry7;
 709     int64_t carry8;
 710     int64_t carry9;
 711
 712     carry0 = (h0 + (1L << 25)) >> 26;
 713     h1 += carry0;
 714     h0 -= shl64(carry0, 26);
 715     carry4 = (h4 + (1L << 25)) >> 26;
 716     h5 += carry4;
 717     h4 -= shl64(carry4, 26);
 718
 719     carry1 = (h1 + (1L << 24)) >> 25;
 720     h2 += carry1;
 721     h1 -= shl64(carry1, 25);
 722     carry5 = (h5 + (1L << 24)) >> 25;
 723     h6 += carry5;
 724     h5 -= shl64(carry5, 25);
 725
 726     carry2 = (h2 + (1L << 25)) >> 26;
 727     h3 += carry2;
 728     h2 -= shl64(carry2, 26);
 729     carry6 = (h6 + (1L << 25)) >> 26;
 730     h7 += carry6;
 731     h6 -= shl64(carry6, 26);
 732
 733     carry3 = (h3 + (1L << 24)) >> 25;
 734     h4 += carry3;
 735     h3 -= shl64(carry3, 25);
 736     carry7 = (h7 + (1L << 24)) >> 25;
 737     h8 += carry7;
 738     h7 -= shl64(carry7, 25);
 739
 740     carry4 = (h4 + (1L << 25)) >> 26;
 741     h5 += carry4;
 742     h4 -= shl64(carry4, 26);
 743     carry8 = (h8 + (1L << 25)) >> 26;
 744     h9 += carry8;
 745     h8 -= shl64(carry8, 26);
 746
 747     carry9 = (h9 + (1L << 24)) >> 25;
 748     h0 += carry9 * 19;
 749     h9 -= shl64(carry9, 25);
 750
 751     carry0 = (h0 + (1L << 25)) >> 26;
 752     h1 += carry0;
 753     h0 -= shl64(carry0, 26);
 754
 755     h[0] = (int32_t) h0;
 756     h[1] = (int32_t) h1;
 757     h[2] = (int32_t) h2;
 758     h[3] = (int32_t) h3;
 759     h[4] = (int32_t) h4;
 760     h[5] = (int32_t) h5;
 761     h[6] = (int32_t) h6;
 762     h[7] = (int32_t) h7;
 763     h[8] = (int32_t) h8;
 764     h[9] = (int32_t) h9;
 765 }
 766
 767
 768 /*
 769 h = f * 121666
 770 Can overlap h with f.
 771
 772 Preconditions:
 773    |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 774
 775 Postconditions:
 776    |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 777 */
 778
 779 void fe_mul121666(fe h, fe f) {
 780     int32_t f0 = f[0];
 781     int32_t f1 = f[1];
 782     int32_t f2 = f[2];
 783     int32_t f3 = f[3];
 784     int32_t f4 = f[4];
 785     int32_t f5 = f[5];
 786     int32_t f6 = f[6];
 787     int32_t f7 = f[7];
 788     int32_t f8 = f[8];
 789     int32_t f9 = f[9];
 790     int64_t h0 = f0 * (int64_t) 121666;
 791     int64_t h1 = f1 * (int64_t) 121666;
 792     int64_t h2 = f2 * (int64_t) 121666;
 793     int64_t h3 = f3 * (int64_t) 121666;
 794     int64_t h4 = f4 * (int64_t) 121666;
 795     int64_t h5 = f5 * (int64_t) 121666;
 796     int64_t h6 = f6 * (int64_t) 121666;
 797     int64_t h7 = f7 * (int64_t) 121666;
 798     int64_t h8 = f8 * (int64_t) 121666;
 799     int64_t h9 = f9 * (int64_t) 121666;
 800     int64_t carry0;
 801     int64_t carry1;
 802     int64_t carry2;
 803     int64_t carry3;
 804     int64_t carry4;
 805     int64_t carry5;
 806     int64_t carry6;
 807     int64_t carry7;
 808     int64_t carry8;
 809     int64_t carry9;
 810
 811     carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= shl64(carry9, 25);
 812     carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= shl64(carry1, 25);
 813     carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= shl64(carry3, 25);
 814     carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= shl64(carry5, 25);
 815     carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= shl64(carry7, 25);
 816
 817     carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= shl64(carry0, 26);
 818     carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= shl64(carry2, 26);
 819     carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= shl64(carry4, 26);
 820     carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= shl64(carry6, 26);
 821     carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= shl64(carry8, 26);
 822
 823     h[0] = h0;
 824     h[1] = h1;
 825     h[2] = h2;
 826     h[3] = h3;
 827     h[4] = h4;
 828     h[5] = h5;
 829     h[6] = h6;
 830     h[7] = h7;
 831     h[8] = h8;
 832     h[9] = h9;
 833 }
 834
 835
 836 /*
 837 h = -f
 838
 839 Preconditions:
 840    |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 841
 842 Postconditions:
 843    |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 844 */
 845
 846 void fe_neg(fe h, const fe f) {
 847     int32_t f0 = f[0];
 848     int32_t f1 = f[1];
 849     int32_t f2 = f[2];
 850     int32_t f3 = f[3];
 851     int32_t f4 = f[4];
 852     int32_t f5 = f[5];
 853     int32_t f6 = f[6];
 854     int32_t f7 = f[7];
 855     int32_t f8 = f[8];
 856     int32_t f9 = f[9];
 857     int32_t h0 = -f0;
 858     int32_t h1 = -f1;
 859     int32_t h2 = -f2;
 860     int32_t h3 = -f3;
 861     int32_t h4 = -f4;
 862     int32_t h5 = -f5;
 863     int32_t h6 = -f6;
 864     int32_t h7 = -f7;
 865     int32_t h8 = -f8;
 866     int32_t h9 = -f9;
 867
 868     h[0] = h0;
 869     h[1] = h1;
 870     h[2] = h2;
 871     h[3] = h3;
 872     h[4] = h4;
 873     h[5] = h5;
 874     h[6] = h6;
 875     h[7] = h7;
 876     h[8] = h8;
 877     h[9] = h9;
 878 }
 879
 880
 881 void fe_pow22523(fe out, const fe z) {
 882     fe t0;
 883     fe t1;
 884     fe t2;
 885     int i;
 886     fe_sq(t0, z);
 887
 888     for (i = 1; i < 1; ++i) {
 889         fe_sq(t0, t0);
 890     }
 891
 892     fe_sq(t1, t0);
 893
 894     for (i = 1; i < 2; ++i) {
 895         fe_sq(t1, t1);
 896     }
 897
 898     fe_mul(t1, z, t1);
 899     fe_mul(t0, t0, t1);
 900     fe_sq(t0, t0);
 901
 902     for (i = 1; i < 1; ++i) {
 903         fe_sq(t0, t0);
 904     }
 905
 906     fe_mul(t0, t1, t0);
 907     fe_sq(t1, t0);
 908
 909     for (i = 1; i < 5; ++i) {
 910         fe_sq(t1, t1);
 911     }
 912
 913     fe_mul(t0, t1, t0);
 914     fe_sq(t1, t0);
 915
 916     for (i = 1; i < 10; ++i) {
 917         fe_sq(t1, t1);
 918     }
 919
 920     fe_mul(t1, t1, t0);
 921     fe_sq(t2, t1);
 922
 923     for (i = 1; i < 20; ++i) {
 924         fe_sq(t2, t2);
 925     }
 926
 927     fe_mul(t1, t2, t1);
 928     fe_sq(t1, t1);
 929
 930     for (i = 1; i < 10; ++i) {
 931         fe_sq(t1, t1);
 932     }
 933
 934     fe_mul(t0, t1, t0);
 935     fe_sq(t1, t0);
 936
 937     for (i = 1; i < 50; ++i) {
 938         fe_sq(t1, t1);
 939     }
 940
 941     fe_mul(t1, t1, t0);
 942     fe_sq(t2, t1);
 943
 944     for (i = 1; i < 100; ++i) {
 945         fe_sq(t2, t2);
 946     }
 947
 948     fe_mul(t1, t2, t1);
 949     fe_sq(t1, t1);
 950
 951     for (i = 1; i < 50; ++i) {
 952         fe_sq(t1, t1);
 953     }
 954
 955     fe_mul(t0, t1, t0);
 956     fe_sq(t0, t0);
 957
 958     for (i = 1; i < 2; ++i) {
 959         fe_sq(t0, t0);
 960     }
 961
 962     fe_mul(out, t0, z);
 963     return;
 964 }
 965
 966
 967 /*
 968 h = f * f
 969 Can overlap h with f.
 970
 971 Preconditions:
 972    |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
 973
 974 Postconditions:
 975    |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
 976 */
 977
 978 /*
 979 See fe_mul.c for discussion of implementation strategy.
 980 */
 981
 982 void fe_sq(fe h, const fe f) {
 983     int32_t f0 = f[0];
 984     int32_t f1 = f[1];
 985     int32_t f2 = f[2];
 986     int32_t f3 = f[3];
 987     int32_t f4 = f[4];
 988     int32_t f5 = f[5];
 989     int32_t f6 = f[6];
 990     int32_t f7 = f[7];
 991     int32_t f8 = f[8];
 992     int32_t f9 = f[9];
 993     int32_t f0_2 = 2 * f0;
 994     int32_t f1_2 = 2 * f1;
 995     int32_t f2_2 = 2 * f2;
 996     int32_t f3_2 = 2 * f3;
 997     int32_t f4_2 = 2 * f4;
 998     int32_t f5_2 = 2 * f5;
 999     int32_t f6_2 = 2 * f6;
1000     int32_t f7_2 = 2 * f7;
1001     int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
1002     int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
1003     int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
1004     int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
1005     int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
1006     int64_t f0f0    = f0   * (int64_t) f0;
1007     int64_t f0f1_2  = f0_2 * (int64_t) f1;
1008     int64_t f0f2_2  = f0_2 * (int64_t) f2;
1009     int64_t f0f3_2  = f0_2 * (int64_t) f3;
1010     int64_t f0f4_2  = f0_2 * (int64_t) f4;
1011     int64_t f0f5_2  = f0_2 * (int64_t) f5;
1012     int64_t f0f6_2  = f0_2 * (int64_t) f6;
1013     int64_t f0f7_2  = f0_2 * (int64_t) f7;
1014     int64_t f0f8_2  = f0_2 * (int64_t) f8;
1015     int64_t f0f9_2  = f0_2 * (int64_t) f9;
1016     int64_t f1f1_2  = f1_2 * (int64_t) f1;
1017     int64_t f1f2_2  = f1_2 * (int64_t) f2;
1018     int64_t f1f3_4  = f1_2 * (int64_t) f3_2;
1019     int64_t f1f4_2  = f1_2 * (int64_t) f4;
1020     int64_t f1f5_4  = f1_2 * (int64_t) f5_2;
1021     int64_t f1f6_2  = f1_2 * (int64_t) f6;
1022     int64_t f1f7_4  = f1_2 * (int64_t) f7_2;
1023     int64_t f1f8_2  = f1_2 * (int64_t) f8;
1024     int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
1025     int64_t f2f2    = f2   * (int64_t) f2;
1026     int64_t f2f3_2  = f2_2 * (int64_t) f3;
1027     int64_t f2f4_2  = f2_2 * (int64_t) f4;
1028     int64_t f2f5_2  = f2_2 * (int64_t) f5;
1029     int64_t f2f6_2  = f2_2 * (int64_t) f6;
1030     int64_t f2f7_2  = f2_2 * (int64_t) f7;
1031     int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
1032     int64_t f2f9_38 = f2   * (int64_t) f9_38;
1033     int64_t f3f3_2  = f3_2 * (int64_t) f3;
1034     int64_t f3f4_2  = f3_2 * (int64_t) f4;
1035     int64_t f3f5_4  = f3_2 * (int64_t) f5_2;
1036     int64_t f3f6_2  = f3_2 * (int64_t) f6;
1037     int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
1038     int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
1039     int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
1040     int64_t f4f4    = f4   * (int64_t) f4;
1041     int64_t f4f5_2  = f4_2 * (int64_t) f5;
1042     int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
1043     int64_t f4f7_38 = f4   * (int64_t) f7_38;
1044     int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
1045     int64_t f4f9_38 = f4   * (int64_t) f9_38;
1046     int64_t f5f5_38 = f5   * (int64_t) f5_38;
1047     int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
1048     int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
1049     int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
1050     int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
1051     int64_t f6f6_19 = f6   * (int64_t) f6_19;
1052     int64_t f6f7_38 = f6   * (int64_t) f7_38;
1053     int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
1054     int64_t f6f9_38 = f6   * (int64_t) f9_38;
1055     int64_t f7f7_38 = f7   * (int64_t) f7_38;
1056     int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
1057     int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
1058     int64_t f8f8_19 = f8   * (int64_t) f8_19;
1059     int64_t f8f9_38 = f8   * (int64_t) f9_38;
1060     int64_t f9f9_38 = f9   * (int64_t) f9_38;
1061     int64_t h0 = f0f0  + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
1062     int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
1063     int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
1064     int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
1065     int64_t h4 = f0f4_2 + f1f3_4 + f2f2   + f5f9_76 + f6f8_38 + f7f7_38;
1066     int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
1067     int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
1068     int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
1069     int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4   + f9f9_38;
1070     int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;
1071     int64_t carry0;
1072     int64_t carry1;
1073     int64_t carry2;
1074     int64_t carry3;
1075     int64_t carry4;
1076     int64_t carry5;
1077     int64_t carry6;
1078     int64_t carry7;
1079     int64_t carry8;
1080     int64_t carry9;
1081     carry0 = (h0 + (1L << 25)) >> 26;
1082     h1 += carry0;
1083     h0 -= shl64(carry0, 26);
1084     carry4 = (h4 + (1L << 25)) >> 26;
1085     h5 += carry4;
1086     h4 -= shl64(carry4, 26);
1087     carry1 = (h1 + (1L << 24)) >> 25;
1088     h2 += carry1;
1089     h1 -= shl64(carry1, 25);
1090     carry5 = (h5 + (1L << 24)) >> 25;
1091     h6 += carry5;
1092     h5 -= shl64(carry5, 25);
1093     carry2 = (h2 + (1L << 25)) >> 26;
1094     h3 += carry2;
1095     h2 -= shl64(carry2, 26);
1096     carry6 = (h6 + (1L << 25)) >> 26;
1097     h7 += carry6;
1098     h6 -= shl64(carry6, 26);
1099     carry3 = (h3 + (1L << 24)) >> 25;
1100     h4 += carry3;
1101     h3 -= shl64(carry3, 25);
1102     carry7 = (h7 + (1L << 24)) >> 25;
1103     h8 += carry7;
1104     h7 -= shl64(carry7, 25);
1105     carry4 = (h4 + (1L << 25)) >> 26;
1106     h5 += carry4;
1107     h4 -= shl64(carry4, 26);
1108     carry8 = (h8 + (1L << 25)) >> 26;
1109     h9 += carry8;
1110     h8 -= shl64(carry8, 26);
1111     carry9 = (h9 + (1L << 24)) >> 25;
1112     h0 += carry9 * 19;
1113     h9 -= shl64(carry9, 25);
1114     carry0 = (h0 + (1L << 25)) >> 26;
1115     h1 += carry0;
1116     h0 -= shl64(carry0, 26);
1117     h[0] = (int32_t) h0;
1118     h[1] = (int32_t) h1;
1119     h[2] = (int32_t) h2;
1120     h[3] = (int32_t) h3;
1121     h[4] = (int32_t) h4;
1122     h[5] = (int32_t) h5;
1123     h[6] = (int32_t) h6;
1124     h[7] = (int32_t) h7;
1125     h[8] = (int32_t) h8;
1126     h[9] = (int32_t) h9;
1127 }
1128
1129
1130 /*
1131 h = 2 * f * f
1132 Can overlap h with f.
1133
1134 Preconditions:
1135    |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
1136
1137 Postconditions:
1138    |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
1139 */
1140
1141 /*
1142 See fe_mul.c for discussion of implementation strategy.
1143 */
1144
1145 void fe_sq2(fe h, const fe f) {
1146     int32_t f0 = f[0];
1147     int32_t f1 = f[1];
1148     int32_t f2 = f[2];
1149     int32_t f3 = f[3];
1150     int32_t f4 = f[4];
1151     int32_t f5 = f[5];
1152     int32_t f6 = f[6];
1153     int32_t f7 = f[7];
1154     int32_t f8 = f[8];
1155     int32_t f9 = f[9];
1156     int32_t f0_2 = 2 * f0;
1157     int32_t f1_2 = 2 * f1;
1158     int32_t f2_2 = 2 * f2;
1159     int32_t f3_2 = 2 * f3;
1160     int32_t f4_2 = 2 * f4;
1161     int32_t f5_2 = 2 * f5;
1162     int32_t f6_2 = 2 * f6;
1163     int32_t f7_2 = 2 * f7;
1164     int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
1165     int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
1166     int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
1167     int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
1168     int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
1169     int64_t f0f0    = f0   * (int64_t) f0;
1170     int64_t f0f1_2  = f0_2 * (int64_t) f1;
1171     int64_t f0f2_2  = f0_2 * (int64_t) f2;
1172     int64_t f0f3_2  = f0_2 * (int64_t) f3;
1173     int64_t f0f4_2  = f0_2 * (int64_t) f4;
1174     int64_t f0f5_2  = f0_2 * (int64_t) f5;
1175     int64_t f0f6_2  = f0_2 * (int64_t) f6;
1176     int64_t f0f7_2  = f0_2 * (int64_t) f7;
1177     int64_t f0f8_2  = f0_2 * (int64_t) f8;
1178     int64_t f0f9_2  = f0_2 * (int64_t) f9;
1179     int64_t f1f1_2  = f1_2 * (int64_t) f1;
1180     int64_t f1f2_2  = f1_2 * (int64_t) f2;
1181     int64_t f1f3_4  = f1_2 * (int64_t) f3_2;
1182     int64_t f1f4_2  = f1_2 * (int64_t) f4;
1183     int64_t f1f5_4  = f1_2 * (int64_t) f5_2;
1184     int64_t f1f6_2  = f1_2 * (int64_t) f6;
1185     int64_t f1f7_4  = f1_2 * (int64_t) f7_2;
1186     int64_t f1f8_2  = f1_2 * (int64_t) f8;
1187     int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
1188     int64_t f2f2    = f2   * (int64_t) f2;
1189     int64_t f2f3_2  = f2_2 * (int64_t) f3;
1190     int64_t f2f4_2  = f2_2 * (int64_t) f4;
1191     int64_t f2f5_2  = f2_2 * (int64_t) f5;
1192     int64_t f2f6_2  = f2_2 * (int64_t) f6;
1193     int64_t f2f7_2  = f2_2 * (int64_t) f7;
1194     int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
1195     int64_t f2f9_38 = f2   * (int64_t) f9_38;
1196     int64_t f3f3_2  = f3_2 * (int64_t) f3;
1197     int64_t f3f4_2  = f3_2 * (int64_t) f4;
1198     int64_t f3f5_4  = f3_2 * (int64_t) f5_2;
1199     int64_t f3f6_2  = f3_2 * (int64_t) f6;
1200     int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
1201     int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
1202     int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
1203     int64_t f4f4    = f4   * (int64_t) f4;
1204     int64_t f4f5_2  = f4_2 * (int64_t) f5;
1205     int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
1206     int64_t f4f7_38 = f4   * (int64_t) f7_38;
1207     int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
1208     int64_t f4f9_38 = f4   * (int64_t) f9_38;
1209     int64_t f5f5_38 = f5   * (int64_t) f5_38;
1210     int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
1211     int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
1212     int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
1213     int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
1214     int64_t f6f6_19 = f6   * (int64_t) f6_19;
1215     int64_t f6f7_38 = f6   * (int64_t) f7_38;
1216     int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
1217     int64_t f6f9_38 = f6   * (int64_t) f9_38;
1218     int64_t f7f7_38 = f7   * (int64_t) f7_38;
1219     int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
1220     int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
1221     int64_t f8f8_19 = f8   * (int64_t) f8_19;
1222     int64_t f8f9_38 = f8   * (int64_t) f9_38;
1223     int64_t f9f9_38 = f9   * (int64_t) f9_38;
1224     int64_t h0 = f0f0  + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
1225     int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
1226     int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
1227     int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
1228     int64_t h4 = f0f4_2 + f1f3_4 + f2f2   + f5f9_76 + f6f8_38 + f7f7_38;
1229     int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
1230     int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
1231     int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
1232     int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4   + f9f9_38;
1233     int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;
1234     int64_t carry0;
1235     int64_t carry1;
1236     int64_t carry2;
1237     int64_t carry3;
1238     int64_t carry4;
1239     int64_t carry5;
1240     int64_t carry6;
1241     int64_t carry7;
1242     int64_t carry8;
1243     int64_t carry9;
1244     h0 += h0;
1245     h1 += h1;
1246     h2 += h2;
1247     h3 += h3;
1248     h4 += h4;
1249     h5 += h5;
1250     h6 += h6;
1251     h7 += h7;
1252     h8 += h8;
1253     h9 += h9;
1254     carry0 = (h0 + (1L << 25)) >> 26;
1255     h1 += carry0;
1256     h0 -= shl64(carry0, 26);
1257     carry4 = (h4 + (1L << 25)) >> 26;
1258     h5 += carry4;
1259     h4 -= shl64(carry4, 26);
1260     carry1 = (h1 + (1L << 24)) >> 25;
1261     h2 += carry1;
1262     h1 -= shl64(carry1, 25);
1263     carry5 = (h5 + (1L << 24)) >> 25;
1264     h6 += carry5;
1265     h5 -= shl64(carry5, 25);
1266     carry2 = (h2 + (1L << 25)) >> 26;
1267     h3 += carry2;
1268     h2 -= shl64(carry2, 26);
1269     carry6 = (h6 + (1L << 25)) >> 26;
1270     h7 += carry6;
1271     h6 -= shl64(carry6, 26);
1272     carry3 = (h3 + (1L << 24)) >> 25;
1273     h4 += carry3;
1274     h3 -= shl64(carry3, 25);
1275     carry7 = (h7 + (1L << 24)) >> 25;
1276     h8 += carry7;
1277     h7 -= shl64(carry7, 25);
1278     carry4 = (h4 + (1L << 25)) >> 26;
1279     h5 += carry4;
1280     h4 -= shl64(carry4, 26);
1281     carry8 = (h8 + (1L << 25)) >> 26;
1282     h9 += carry8;
1283     h8 -= shl64(carry8, 26);
1284     carry9 = (h9 + (1L << 24)) >> 25;
1285     h0 += carry9 * 19;
1286     h9 -= shl64(carry9, 25);
1287     carry0 = (h0 + (1L << 25)) >> 26;
1288     h1 += carry0;
1289     h0 -= shl64(carry0, 26);
1290     h[0] = (int32_t) h0;
1291     h[1] = (int32_t) h1;
1292     h[2] = (int32_t) h2;
1293     h[3] = (int32_t) h3;
1294     h[4] = (int32_t) h4;
1295     h[5] = (int32_t) h5;
1296     h[6] = (int32_t) h6;
1297     h[7] = (int32_t) h7;
1298     h[8] = (int32_t) h8;
1299     h[9] = (int32_t) h9;
1300 }
1301
1302
1303 /*
1304 h = f - g
1305 Can overlap h with f or g.
1306
1307 Preconditions:
1308    |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
1309    |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
1310
1311 Postconditions:
1312    |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
1313 */
1314
1315 void fe_sub(fe h, const fe f, const fe g) {
1316     int32_t f0 = f[0];
1317     int32_t f1 = f[1];
1318     int32_t f2 = f[2];
1319     int32_t f3 = f[3];
1320     int32_t f4 = f[4];
1321     int32_t f5 = f[5];
1322     int32_t f6 = f[6];
1323     int32_t f7 = f[7];
1324     int32_t f8 = f[8];
1325     int32_t f9 = f[9];
1326     int32_t g0 = g[0];
1327     int32_t g1 = g[1];
1328     int32_t g2 = g[2];
1329     int32_t g3 = g[3];
1330     int32_t g4 = g[4];
1331     int32_t g5 = g[5];
1332     int32_t g6 = g[6];
1333     int32_t g7 = g[7];
1334     int32_t g8 = g[8];
1335     int32_t g9 = g[9];
1336     int32_t h0 = f0 - g0;
1337     int32_t h1 = f1 - g1;
1338     int32_t h2 = f2 - g2;
1339     int32_t h3 = f3 - g3;
1340     int32_t h4 = f4 - g4;
1341     int32_t h5 = f5 - g5;
1342     int32_t h6 = f6 - g6;
1343     int32_t h7 = f7 - g7;
1344     int32_t h8 = f8 - g8;
1345     int32_t h9 = f9 - g9;
1346
1347     h[0] = h0;
1348     h[1] = h1;
1349     h[2] = h2;
1350     h[3] = h3;
1351     h[4] = h4;
1352     h[5] = h5;
1353     h[6] = h6;
1354     h[7] = h7;
1355     h[8] = h8;
1356     h[9] = h9;
1357 }
1358
1359
1360
1361 /*
1362 Preconditions:
1363   |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
1364
1365 Write p=2^255-19; q=floor(h/p).
1366 Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
1367
1368 Proof:
1369   Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
1370   Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
1371
1372   Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
1373   Then 0<y<1.
1374
1375   Write r=h-pq.
1376   Have 0<=r<=p-1=2^255-20.
1377   Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
1378
1379   Write x=r+19(2^-255)r+y.
1380   Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
1381
1382   Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
1383   so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
1384 */
1385
1386 void fe_tobytes(unsigned char *s, const fe h) {
1387     int32_t h0 = h[0];
1388     int32_t h1 = h[1];
1389     int32_t h2 = h[2];
1390     int32_t h3 = h[3];
1391     int32_t h4 = h[4];
1392     int32_t h5 = h[5];
1393     int32_t h6 = h[6];
1394     int32_t h7 = h[7];
1395     int32_t h8 = h[8];
1396     int32_t h9 = h[9];
1397     int32_t q;
1398     int32_t carry0;
1399     int32_t carry1;
1400     int32_t carry2;
1401     int32_t carry3;
1402     int32_t carry4;
1403     int32_t carry5;
1404     int32_t carry6;
1405     int32_t carry7;
1406     int32_t carry8;
1407     int32_t carry9;
1408     q = (19 * h9 + (((int32_t) 1) << 24)) >> 25;
1409     q = (h0 + q) >> 26;
1410     q = (h1 + q) >> 25;
1411     q = (h2 + q) >> 26;
1412     q = (h3 + q) >> 25;
1413     q = (h4 + q) >> 26;
1414     q = (h5 + q) >> 25;
1415     q = (h6 + q) >> 26;
1416     q = (h7 + q) >> 25;
1417     q = (h8 + q) >> 26;
1418     q = (h9 + q) >> 25;
1419     /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
1420     h0 += 19 * q;
1421     /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
1422     carry0 = h0 >> 26;
1423     h1 += carry0;
1424     h0 -= shl32(carry0, 26);
1425     carry1 = h1 >> 25;
1426     h2 += carry1;
1427     h1 -= shl32(carry1, 25);
1428     carry2 = h2 >> 26;
1429     h3 += carry2;
1430     h2 -= shl32(carry2, 26);
1431     carry3 = h3 >> 25;
1432     h4 += carry3;
1433     h3 -= shl32(carry3, 25);
1434     carry4 = h4 >> 26;
1435     h5 += carry4;
1436     h4 -= shl32(carry4, 26);
1437     carry5 = h5 >> 25;
1438     h6 += carry5;
1439     h5 -= shl32(carry5, 25);
1440     carry6 = h6 >> 26;
1441     h7 += carry6;
1442     h6 -= shl32(carry6, 26);
1443     carry7 = h7 >> 25;
1444     h8 += carry7;
1445     h7 -= shl32(carry7, 25);
1446     carry8 = h8 >> 26;
1447     h9 += carry8;
1448     h8 -= shl32(carry8, 26);
1449     carry9 = h9 >> 25;
1450     h9 -= shl32(carry9, 25);
1451
1452     /* h10 = carry9 */
1453     /*
1454     Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
1455     Have h0+...+2^230 h9 between 0 and 2^255-1;
1456     evidently 2^255 h10-2^255 q = 0.
1457     Goal: Output h0+...+2^230 h9.
1458     */
1459     s[0] = (unsigned char) (h0 >> 0);
1460     s[1] = (unsigned char) (h0 >> 8);
1461     s[2] = (unsigned char) (h0 >> 16);
1462     s[3] = (unsigned char) ((h0 >> 24) | shl32(h1, 2));
1463     s[4] = (unsigned char) (h1 >> 6);
1464     s[5] = (unsigned char) (h1 >> 14);
1465     s[6] = (unsigned char) ((h1 >> 22) | shl32(h2, 3));
1466     s[7] = (unsigned char) (h2 >> 5);
1467     s[8] = (unsigned char) (h2 >> 13);
1468     s[9] = (unsigned char) ((h2 >> 21) | shl32(h3, 5));
1469     s[10] = (unsigned char) (h3 >> 3);
1470     s[11] = (unsigned char) (h3 >> 11);
1471     s[12] = (unsigned char) ((h3 >> 19) | shl32(h4, 6));
1472     s[13] = (unsigned char) (h4 >> 2);
1473     s[14] = (unsigned char) (h4 >> 10);
1474     s[15] = (unsigned char) (h4 >> 18);
1475     s[16] = (unsigned char) (h5 >> 0);
1476     s[17] = (unsigned char) (h5 >> 8);
1477     s[18] = (unsigned char) (h5 >> 16);
1478     s[19] = (unsigned char) ((h5 >> 24) | shl32(h6, 1));
1479     s[20] = (unsigned char) (h6 >> 7);
1480     s[21] = (unsigned char) (h6 >> 15);
1481     s[22] = (unsigned char) ((h6 >> 23) | shl32(h7, 3));
1482     s[23] = (unsigned char) (h7 >> 5);
1483     s[24] = (unsigned char) (h7 >> 13);
1484     s[25] = (unsigned char) ((h7 >> 21) | shl32(h8, 4));
1485     s[26] = (unsigned char) (h8 >> 4);
1486     s[27] = (unsigned char) (h8 >> 12);
1487     s[28] = (unsigned char) ((h8 >> 20) | shl32(h9, 6));
1488     s[29] = (unsigned char) (h9 >> 2);
1489     s[30] = (unsigned char) (h9 >> 10);
1490     s[31] = (unsigned char) (h9 >> 18);
1491 }