Use faster addition chains for P-256 field inversion.

Switch to the addition chains by Brian Smith in
https://briansmith.org/ecc-inversion-addition-chains-01#p256_field_inversion
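
For reference, both versions now compute the inverse square directly via
Fermat's Little Theorem. As a sketch, in the same notation as the code
comments:

  p = 2^256 - 2^224 + 2^192 + 2^96 - 1
  a^(p-1) = 1 (mod p), so a^(p-3) = a^-2 (mod p)
  p - 3 = 2^256 - 2^224 + 2^192 + 2^96 - 2^2

The chain first builds windows of the form a^(2^k - 1) (the x2, x3, x6,
x12, x15, x30, and x32 temporaries) and then assembles the exponent from
them.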

The new addition chains are a bit faster when the inversion is measured
on its own, but the difference isn't noticeable once everything else in
ECDH is included. Rather, the motivation is just to align fiat_p256,
nistz256, and a possible future fiat_p384 import.

Since it's free, I've included the (negligible) z^-2 optimization, though
it doesn't actually matter if we ever want a z^-1 abstraction later. In
the meantime, it replaces the (even more negligible) Montgomery
conversion optimization, which fit a bit oddly with the EC_FELEM
abstraction. (I'm pondering whether we want an EC_AFFINE abstraction,
given how the Trust Tokens DLEQ proofs work.)

fiat_p256 (64-bit):
Before:
Did 539000 P-256 get x operations in 5007148us (107646.1 ops/sec)
Did 532000 P-256 get x and y operations in 5008736us (106214.4 ops/sec)
After:
Did 607000 P-256 get x operations in 5005225us (121273.3 ops/sec)
Did 594000 P-256 get x and y operations in 5001251us (118770.3 ops/sec)

nistz256:
Before:
Did 1472000 P-256 get x operations in 5003286us (294206.6 ops/sec)
Did 1445000 P-256 get x and y operations in 5002052us (288881.4 ops/sec)
After:
Did 1491000 P-256 get x operations in 5002524us (298049.5 ops/sec)
Did 1452000 P-256 get x and y operations in 5003193us (290214.7 ops/sec)

I haven't bothered checking in the benchmarks, as those operations are
largely artificial on their own. They're a consequence of using the same
type for affine and Jacobian points.

Change-Id: I71e0d50a8712198f9cb8f68d50894d14a6091635
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/40867
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/ec/p256-x86_64.c b/crypto/fipsmodule/ec/p256-x86_64.c
index 9f5bf87..cb82852 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/crypto/fipsmodule/ec/p256-x86_64.c
@@ -117,86 +117,73 @@
   return in;
 }
 
-// ecp_nistz256_mod_inverse_mont sets |r| to (|in| * 2^-256)^-1 * 2^256 mod p.
-// That is, |r| is the modular inverse of |in| for input and output in the
-// Montgomery domain.
-static void ecp_nistz256_mod_inverse_mont(BN_ULONG r[P256_LIMBS],
-                                          const BN_ULONG in[P256_LIMBS]) {
-  /* The poly is ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff
-     ffffffff
-     We use FLT and used poly-2 as exponent */
-  BN_ULONG p2[P256_LIMBS];
-  BN_ULONG p4[P256_LIMBS];
-  BN_ULONG p8[P256_LIMBS];
-  BN_ULONG p16[P256_LIMBS];
-  BN_ULONG p32[P256_LIMBS];
-  BN_ULONG res[P256_LIMBS];
-  int i;
+// ecp_nistz256_mod_inverse_sqr_mont sets |r| to (|in| * 2^-256)^-2 * 2^256 mod
+// p. That is, |r| is the modular inverse square of |in| for input and output in
+// the Montgomery domain.
+static void ecp_nistz256_mod_inverse_sqr_mont(BN_ULONG r[P256_LIMBS],
+                                              const BN_ULONG in[P256_LIMBS]) {
+  // This implements the addition chain described in
+  // https://briansmith.org/ecc-inversion-addition-chains-01#p256_field_inversion
+  BN_ULONG x2[P256_LIMBS], x3[P256_LIMBS], x6[P256_LIMBS], x12[P256_LIMBS],
+      x15[P256_LIMBS], x30[P256_LIMBS], x32[P256_LIMBS];
+  ecp_nistz256_sqr_mont(x2, in);      // 2^2 - 2^1
+  ecp_nistz256_mul_mont(x2, x2, in);  // 2^2 - 2^0
 
-  ecp_nistz256_sqr_mont(res, in);
-  ecp_nistz256_mul_mont(p2, res, in);  // 3*p
+  ecp_nistz256_sqr_mont(x3, x2);      // 2^3 - 2^1
+  ecp_nistz256_mul_mont(x3, x3, in);  // 2^3 - 2^0
 
-  ecp_nistz256_sqr_mont(res, p2);
-  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_mul_mont(p4, res, p2);  // f*p
+  ecp_nistz256_sqr_mont(x6, x3);
+  for (int i = 1; i < 3; i++) {
+    ecp_nistz256_sqr_mont(x6, x6);
+  }                                   // 2^6 - 2^3
+  ecp_nistz256_mul_mont(x6, x6, x3);  // 2^6 - 2^0
 
-  ecp_nistz256_sqr_mont(res, p4);
-  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_mul_mont(p8, res, p4);  // ff*p
+  ecp_nistz256_sqr_mont(x12, x6);
+  for (int i = 1; i < 6; i++) {
+    ecp_nistz256_sqr_mont(x12, x12);
+  }                                     // 2^12 - 2^6
+  ecp_nistz256_mul_mont(x12, x12, x6);  // 2^12 - 2^0
 
-  ecp_nistz256_sqr_mont(res, p8);
-  for (i = 0; i < 7; i++) {
-    ecp_nistz256_sqr_mont(res, res);
-  }
-  ecp_nistz256_mul_mont(p16, res, p8);  // ffff*p
+  ecp_nistz256_sqr_mont(x15, x12);
+  for (int i = 1; i < 3; i++) {
+    ecp_nistz256_sqr_mont(x15, x15);
+  }                                     // 2^15 - 2^3
+  ecp_nistz256_mul_mont(x15, x15, x3);  // 2^15 - 2^0
 
-  ecp_nistz256_sqr_mont(res, p16);
-  for (i = 0; i < 15; i++) {
-    ecp_nistz256_sqr_mont(res, res);
-  }
-  ecp_nistz256_mul_mont(p32, res, p16);  // ffffffff*p
+  ecp_nistz256_sqr_mont(x30, x15);
+  for (int i = 1; i < 15; i++) {
+    ecp_nistz256_sqr_mont(x30, x30);
+  }                                      // 2^30 - 2^15
+  ecp_nistz256_mul_mont(x30, x30, x15);  // 2^30 - 2^0
 
-  ecp_nistz256_sqr_mont(res, p32);
-  for (i = 0; i < 31; i++) {
-    ecp_nistz256_sqr_mont(res, res);
-  }
-  ecp_nistz256_mul_mont(res, res, in);
+  ecp_nistz256_sqr_mont(x32, x30);
+  ecp_nistz256_sqr_mont(x32, x32);      // 2^32 - 2^2
+  ecp_nistz256_mul_mont(x32, x32, x2);  // 2^32 - 2^0
 
-  for (i = 0; i < 32 * 4; i++) {
-    ecp_nistz256_sqr_mont(res, res);
-  }
-  ecp_nistz256_mul_mont(res, res, p32);
+  BN_ULONG ret[P256_LIMBS];
+  ecp_nistz256_sqr_mont(ret, x32);
+  for (int i = 1; i < 31 + 1; i++) {
+    ecp_nistz256_sqr_mont(ret, ret);
+  }                                     // 2^64 - 2^32
+  ecp_nistz256_mul_mont(ret, ret, in);  // 2^64 - 2^32 + 2^0
 
-  for (i = 0; i < 32; i++) {
-    ecp_nistz256_sqr_mont(res, res);
-  }
-  ecp_nistz256_mul_mont(res, res, p32);
+  for (int i = 0; i < 96 + 32; i++) {
+    ecp_nistz256_sqr_mont(ret, ret);
+  }                                      // 2^192 - 2^160 + 2^128
+  ecp_nistz256_mul_mont(ret, ret, x32);  // 2^192 - 2^160 + 2^128 + 2^32 - 2^0
 
-  for (i = 0; i < 16; i++) {
-    ecp_nistz256_sqr_mont(res, res);
-  }
-  ecp_nistz256_mul_mont(res, res, p16);
+  for (int i = 0; i < 32; i++) {
+    ecp_nistz256_sqr_mont(ret, ret);
+  }                                      // 2^224 - 2^192 + 2^160 + 2^64 - 2^32
+  ecp_nistz256_mul_mont(ret, ret, x32);  // 2^224 - 2^192 + 2^160 + 2^64 - 2^0
 
-  for (i = 0; i < 8; i++) {
-    ecp_nistz256_sqr_mont(res, res);
-  }
-  ecp_nistz256_mul_mont(res, res, p8);
+  for (int i = 0; i < 30; i++) {
+    ecp_nistz256_sqr_mont(ret, ret);
+  }                                      // 2^254 - 2^222 + 2^190 + 2^94 - 2^30
+  ecp_nistz256_mul_mont(ret, ret, x30);  // 2^254 - 2^222 + 2^190 + 2^94 - 2^0
 
-  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_mul_mont(res, res, p4);
-
-  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_mul_mont(res, res, p2);
-
-  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_mul_mont(r, res, in);
+  ecp_nistz256_sqr_mont(ret, ret);
+  ecp_nistz256_sqr_mont(r, ret);  // 2^256 - 2^224 + 2^192 + 2^96 - 2^2
 }
 
 // r = p * p_scalar
@@ -440,24 +427,19 @@
   }
 
   BN_ULONG z_inv2[P256_LIMBS];
-  BN_ULONG z_inv3[P256_LIMBS];
   assert(group->field.width == P256_LIMBS);
-  ecp_nistz256_mod_inverse_mont(z_inv3, point->Z.words);
-  ecp_nistz256_sqr_mont(z_inv2, z_inv3);
-
-  // Instead of using |ecp_nistz256_from_mont| to convert the |x| coordinate
-  // and then calling |ecp_nistz256_from_mont| again to convert the |y|
-  // coordinate below, convert the common factor |z_inv2| once now, saving one
-  // reduction.
-  ecp_nistz256_from_mont(z_inv2, z_inv2);
+  ecp_nistz256_mod_inverse_sqr_mont(z_inv2, point->Z.words);
 
   if (x != NULL) {
     ecp_nistz256_mul_mont(x->words, z_inv2, point->X.words);
+    ecp_nistz256_from_mont(x->words, x->words);
   }
 
   if (y != NULL) {
-    ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2);
-    ecp_nistz256_mul_mont(y->words, z_inv3, point->Y.words);
+    ecp_nistz256_sqr_mont(z_inv2, z_inv2);                            // z^-4
+    ecp_nistz256_mul_mont(y->words, point->Y.words, point->Z.words);  // y * z
+    ecp_nistz256_mul_mont(y->words, y->words, z_inv2);  // y * z^-3
+    ecp_nistz256_from_mont(y->words, y->words);
   }
 
   return 1;
diff --git a/crypto/fipsmodule/ec/p256.c b/crypto/fipsmodule/ec/p256.c
index 9abac0b..bf9e64e 100644
--- a/crypto/fipsmodule/ec/p256.c
+++ b/crypto/fipsmodule/ec/p256.c
@@ -91,70 +91,75 @@
   fiat_p256_to_bytes(out->bytes, in);
 }
 
-// fiat_p256_inv calculates |out| = |in|^{-1}
+// fiat_p256_inv_square calculates |out| = |in|^{-2}
 //
 // Based on Fermat's Little Theorem:
 //   a^p = a (mod p)
 //   a^{p-1} = 1 (mod p)
-//   a^{p-2} = a^{-1} (mod p)
-static void fiat_p256_inv(fiat_p256_felem out, const fiat_p256_felem in) {
-  fiat_p256_felem ftmp, ftmp2;
-  // each e_I will hold |in|^{2^I - 1}
-  fiat_p256_felem e2, e4, e8, e16, e32, e64;
+//   a^{p-3} = a^{-2} (mod p)
+static void fiat_p256_inv_square(fiat_p256_felem out,
+                                 const fiat_p256_felem in) {
+  // This implements the addition chain described in
+  // https://briansmith.org/ecc-inversion-addition-chains-01#p256_field_inversion
+  fiat_p256_felem x2, x3, x6, x12, x15, x30, x32;
+  fiat_p256_square(x2, in);   // 2^2 - 2^1
+  fiat_p256_mul(x2, x2, in);  // 2^2 - 2^0
 
-  fiat_p256_square(ftmp, in);     // 2^1
-  fiat_p256_mul(ftmp, in, ftmp);  // 2^2 - 2^0
-  fiat_p256_copy(e2, ftmp);
-  fiat_p256_square(ftmp, ftmp);   // 2^3 - 2^1
-  fiat_p256_square(ftmp, ftmp);   // 2^4 - 2^2
-  fiat_p256_mul(ftmp, ftmp, e2);  // 2^4 - 2^0
-  fiat_p256_copy(e4, ftmp);
-  fiat_p256_square(ftmp, ftmp);   // 2^5 - 2^1
-  fiat_p256_square(ftmp, ftmp);   // 2^6 - 2^2
-  fiat_p256_square(ftmp, ftmp);   // 2^7 - 2^3
-  fiat_p256_square(ftmp, ftmp);   // 2^8 - 2^4
-  fiat_p256_mul(ftmp, ftmp, e4);  // 2^8 - 2^0
-  fiat_p256_copy(e8, ftmp);
-  for (size_t i = 0; i < 8; i++) {
-    fiat_p256_square(ftmp, ftmp);
-  }                               // 2^16 - 2^8
-  fiat_p256_mul(ftmp, ftmp, e8);  // 2^16 - 2^0
-  fiat_p256_copy(e16, ftmp);
-  for (size_t i = 0; i < 16; i++) {
-    fiat_p256_square(ftmp, ftmp);
-  }                                // 2^32 - 2^16
-  fiat_p256_mul(ftmp, ftmp, e16);  // 2^32 - 2^0
-  fiat_p256_copy(e32, ftmp);
-  for (size_t i = 0; i < 32; i++) {
-    fiat_p256_square(ftmp, ftmp);
-  }  // 2^64 - 2^32
-  fiat_p256_copy(e64, ftmp);
-  fiat_p256_mul(ftmp, ftmp, in);  // 2^64 - 2^32 + 2^0
-  for (size_t i = 0; i < 192; i++) {
-    fiat_p256_square(ftmp, ftmp);
-  }  // 2^256 - 2^224 + 2^192
+  fiat_p256_square(x3, x2);   // 2^3 - 2^1
+  fiat_p256_mul(x3, x3, in);  // 2^3 - 2^0
 
-  fiat_p256_mul(ftmp2, e64, e32);  // 2^64 - 2^0
-  for (size_t i = 0; i < 16; i++) {
-    fiat_p256_square(ftmp2, ftmp2);
-  }                                  // 2^80 - 2^16
-  fiat_p256_mul(ftmp2, ftmp2, e16);  // 2^80 - 2^0
-  for (size_t i = 0; i < 8; i++) {
-    fiat_p256_square(ftmp2, ftmp2);
-  }                                 // 2^88 - 2^8
-  fiat_p256_mul(ftmp2, ftmp2, e8);  // 2^88 - 2^0
-  for (size_t i = 0; i < 4; i++) {
-    fiat_p256_square(ftmp2, ftmp2);
-  }                                 // 2^92 - 2^4
-  fiat_p256_mul(ftmp2, ftmp2, e4);  // 2^92 - 2^0
-  fiat_p256_square(ftmp2, ftmp2);   // 2^93 - 2^1
-  fiat_p256_square(ftmp2, ftmp2);   // 2^94 - 2^2
-  fiat_p256_mul(ftmp2, ftmp2, e2);  // 2^94 - 2^0
-  fiat_p256_square(ftmp2, ftmp2);   // 2^95 - 2^1
-  fiat_p256_square(ftmp2, ftmp2);   // 2^96 - 2^2
-  fiat_p256_mul(ftmp2, ftmp2, in);  // 2^96 - 3
+  fiat_p256_square(x6, x3);
+  for (int i = 1; i < 3; i++) {
+    fiat_p256_square(x6, x6);
+  }                           // 2^6 - 2^3
+  fiat_p256_mul(x6, x6, x3);  // 2^6 - 2^0
 
-  fiat_p256_mul(out, ftmp2, ftmp);  // 2^256 - 2^224 + 2^192 + 2^96 - 3
+  fiat_p256_square(x12, x6);
+  for (int i = 1; i < 6; i++) {
+    fiat_p256_square(x12, x12);
+  }                             // 2^12 - 2^6
+  fiat_p256_mul(x12, x12, x6);  // 2^12 - 2^0
+
+  fiat_p256_square(x15, x12);
+  for (int i = 1; i < 3; i++) {
+    fiat_p256_square(x15, x15);
+  }                             // 2^15 - 2^3
+  fiat_p256_mul(x15, x15, x3);  // 2^15 - 2^0
+
+  fiat_p256_square(x30, x15);
+  for (int i = 1; i < 15; i++) {
+    fiat_p256_square(x30, x30);
+  }                              // 2^30 - 2^15
+  fiat_p256_mul(x30, x30, x15);  // 2^30 - 2^0
+
+  fiat_p256_square(x32, x30);
+  fiat_p256_square(x32, x32);   // 2^32 - 2^2
+  fiat_p256_mul(x32, x32, x2);  // 2^32 - 2^0
+
+  fiat_p256_felem ret;
+  fiat_p256_square(ret, x32);
+  for (int i = 1; i < 31 + 1; i++) {
+    fiat_p256_square(ret, ret);
+  }                             // 2^64 - 2^32
+  fiat_p256_mul(ret, ret, in);  // 2^64 - 2^32 + 2^0
+
+  for (int i = 0; i < 96 + 32; i++) {
+    fiat_p256_square(ret, ret);
+  }                              // 2^192 - 2^160 + 2^128
+  fiat_p256_mul(ret, ret, x32);  // 2^192 - 2^160 + 2^128 + 2^32 - 2^0
+
+  for (int i = 0; i < 32; i++) {
+    fiat_p256_square(ret, ret);
+  }                              // 2^224 - 2^192 + 2^160 + 2^64 - 2^32
+  fiat_p256_mul(ret, ret, x32);  // 2^224 - 2^192 + 2^160 + 2^64 - 2^0
+
+  for (int i = 0; i < 30; i++) {
+    fiat_p256_square(ret, ret);
+  }                              // 2^254 - 2^222 + 2^190 + 2^94 - 2^30
+  fiat_p256_mul(ret, ret, x30);  // 2^254 - 2^222 + 2^190 + 2^94 - 2^0
+
+  fiat_p256_square(ret, ret);
+  fiat_p256_square(out, ret);  // 2^256 - 2^224 + 2^192 + 2^96 - 2^2
 }
 
 // Group operations
@@ -741,27 +746,23 @@
 
   fiat_p256_felem z1, z2;
   fiat_p256_from_generic(z1, &point->Z);
-  fiat_p256_inv(z2, z1);
-  fiat_p256_square(z1, z2);
-
-  // Instead of using |fiat_p256_from_montgomery| to convert the |x| coordinate
-  // and then calling |fiat_p256_from_montgomery| again to convert the |y|
-  // coordinate below, convert the common factor |z1| once now, saving one
-  // reduction.
-  fiat_p256_from_montgomery(z1, z1);
+  fiat_p256_inv_square(z2, z1);
 
   if (x_out != NULL) {
     fiat_p256_felem x;
     fiat_p256_from_generic(x, &point->X);
-    fiat_p256_mul(x, x, z1);
+    fiat_p256_mul(x, x, z2);
+    fiat_p256_from_montgomery(x, x);
     fiat_p256_to_generic(x_out, x);
   }
 
   if (y_out != NULL) {
     fiat_p256_felem y;
     fiat_p256_from_generic(y, &point->Y);
-    fiat_p256_mul(z1, z1, z2);
-    fiat_p256_mul(y, y, z1);
+    fiat_p256_square(z2, z2);  // z^-4
+    fiat_p256_mul(y, y, z1);   // y * z
+    fiat_p256_mul(y, y, z2);   // y * z^-3
+    fiat_p256_from_montgomery(y, y);
     fiat_p256_to_generic(y_out, y);
   }