Import Bedrock code for P-256 point double, add

This code is currently only used in the p256.cc.inc (not nistz), but it
uses formulas similar to the nistz implementation. In particular, point
doubling is implemented using coordinate halving instead of multiplying
denominators by powers of two.

This change also removes the affine-addition optimization, counteracting
some of the efficiency gains but still resulting in an overall speedup
for client-side usage. This removal is intended to be temporary. The
verified affine-addition function uses a slightly different (nistz-like)
calling convention. Later work will unify the p256 and p256-nistz
files behind the new calling convention and restore the optimization
for both small and large tables.

Benchmarks on Zen 4:

../main/build-small/bssl.json
Did 14400 ECDH P-256 operations in 1000289us (14395.8 ops/sec)
Did 37000 ECDSA P-256 signing operations in 1009883us (36637.9 ops/sec)
Did 16000 ECDSA P-256 verify operations in 1003734us (15940.5 ops/sec)

build-small/bssl.json
Did 15886 ECDH P-256 operations in 1000894us (15871.8 ops/sec) [+10.3%]
Did 36000 ECDSA P-256 signing operations in 1017983us (35364.0 ops/sec) [-3.5%]
Did 18000 ECDSA P-256 verify operations in 1045215us (17221.3 ops/sec) [+8.0%]

P-256 ECDH is still ~40% slower in the small build than full build.
Estimating with double = 0.6 add, table size alone would yield ~30%.

Change-Id: Ide90296f9e14bf543df4e397f088602942c0b658
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/79787
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
Auto-Submit: Andres Erbsen <andreser@google.com>
diff --git a/build.json b/build.json
index 152a991..39ed119 100644
--- a/build.json
+++ b/build.json
@@ -566,7 +566,10 @@
             "third_party/fiat/p256_32.h",
             "third_party/fiat/p256_64.h",
             "third_party/fiat/p256_64_msvc.h",
-            "third_party/fiat/p256_bedrock.c.inc",
+            "third_party/fiat/p256_field.c.inc",
+	    "third_party/fiat/p256_field_32.br.c.inc",
+	    "third_party/fiat/p256_field_64.br.c.inc",
+	    "third_party/fiat/p256_point.br.c.inc",
             "third_party/fiat/bedrock_unverified_bareminimum.c.inc",
             "third_party/fiat/bedrock_unverified_platform.c.inc"
         ],
diff --git a/crypto/fipsmodule/ec/p256.cc.inc b/crypto/fipsmodule/ec/p256.cc.inc
index 6a5f2d3..6a8abcd 100644
--- a/crypto/fipsmodule/ec/p256.cc.inc
+++ b/crypto/fipsmodule/ec/p256.cc.inc
@@ -30,14 +30,8 @@
 #include "../delocate.h"
 #include "./internal.h"
 
-#if defined(BORINGSSL_HAS_UINT128)
-#include "../../../third_party/fiat/p256_64.h"
-#elif defined(OPENSSL_64_BIT)
-#include "../../../third_party/fiat/p256_64_msvc.h"
-#else
-#include "../../../third_party/fiat/p256_32.h"
-#endif
-
+#include "../../../third_party/fiat/p256_field.c.inc"
+#include "../../../third_party/fiat/p256_point.br.c.inc"
 
 // utility functions, handwritten
 
@@ -56,13 +50,6 @@
 #endif  // 64BIT
 
 
-static fiat_p256_limb_t fiat_p256_nz(
-    const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) {
-  fiat_p256_limb_t ret;
-  fiat_p256_nonzero(&ret, in1);
-  return ret;
-}
-
 static void fiat_p256_copy(fiat_p256_limb_t out[FIAT_P256_NLIMBS],
                            const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) {
   for (size_t i = 0; i < FIAT_P256_NLIMBS; i++) {
@@ -172,198 +159,44 @@
 // Building on top of the field operations we have the operations on the
 // elliptic curve group itself. Points on the curve are represented in Jacobian
 // coordinates.
-//
-// Both operations were transcribed to Coq and proven to correspond to naive
-// implementations using Affine coordinates, for all suitable fields.  In the
-// Coq proofs, issues of constant-time execution and memory layout (aliasing)
-// conventions were not considered. Specification of affine coordinates:
-// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Spec/WeierstrassCurve.v#L28>
-// As a sanity check, a proof that these points form a commutative group:
-// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/AffineProofs.v#L33>
 
-// fiat_p256_point_double calculates 2*(x_in, y_in, z_in)
-//
-// The method is taken from:
-//   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
-//
-// Coq transcription and correctness proof:
-// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L93>
-// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L201>
-//
-// Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
-// while x_out == y_in is not (maybe this works, but it's not tested).
 static void fiat_p256_point_double(fiat_p256_felem x_out, fiat_p256_felem y_out,
                                    fiat_p256_felem z_out,
                                    const fiat_p256_felem x_in,
                                    const fiat_p256_felem y_in,
                                    const fiat_p256_felem z_in) {
-  fiat_p256_felem delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta;
-  // delta = z^2
-  fiat_p256_square(delta, z_in);
-  // gamma = y^2
-  fiat_p256_square(gamma, y_in);
-  // beta = x*gamma
-  fiat_p256_mul(beta, x_in, gamma);
-
-  // alpha = 3*(x-delta)*(x+delta)
-  fiat_p256_sub(ftmp, x_in, delta);
-  fiat_p256_add(ftmp2, x_in, delta);
-
-  fiat_p256_add(tmptmp, ftmp2, ftmp2);
-  fiat_p256_add(ftmp2, ftmp2, tmptmp);
-  fiat_p256_mul(alpha, ftmp, ftmp2);
-
-  // x' = alpha^2 - 8*beta
-  fiat_p256_square(x_out, alpha);
-  fiat_p256_add(fourbeta, beta, beta);
-  fiat_p256_add(fourbeta, fourbeta, fourbeta);
-  fiat_p256_add(tmptmp, fourbeta, fourbeta);
-  fiat_p256_sub(x_out, x_out, tmptmp);
-
-  // z' = (y + z)^2 - gamma - delta
-  fiat_p256_add(delta, gamma, delta);
-  fiat_p256_add(ftmp, y_in, z_in);
-  fiat_p256_square(z_out, ftmp);
-  fiat_p256_sub(z_out, z_out, delta);
-
-  // y' = alpha*(4*beta - x') - 8*gamma^2
-  fiat_p256_sub(y_out, fourbeta, x_out);
-  fiat_p256_add(gamma, gamma, gamma);
-  fiat_p256_square(gamma, gamma);
-  fiat_p256_mul(y_out, alpha, y_out);
-  fiat_p256_add(gamma, gamma, gamma);
-  fiat_p256_sub(y_out, y_out, gamma);
+  uint8_t out[3*32], in[3*32];
+  static_assert(sizeof(fiat_p256_felem) == 32);
+  OPENSSL_memcpy(&in[0], x_in, 32);
+  OPENSSL_memcpy(&in[32], y_in, 32);
+  OPENSSL_memcpy(&in[64], z_in, 32);
+  p256_point_double((br_word_t)out, (br_word_t)in);
+  OPENSSL_memcpy(x_out, &out[0], 32);
+  OPENSSL_memcpy(y_out, &out[32], 32);
+  OPENSSL_memcpy(z_out, &out[64], 32);
 }
 
-// fiat_p256_point_add calculates (x1, y1, z1) + (x2, y2, z2)
-//
-// The method is taken from:
-//   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
-// adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
-//
-// Coq transcription and correctness proof:
-// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L135>
-// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L205>
-//
-// This function includes a branch for checking whether the two input points
-// are equal, (while not equal to the point at infinity). This case never
-// happens during single point multiplication, so there is no timing leak for
-// ECDH or ECDSA signing.
 static void fiat_p256_point_add(fiat_p256_felem x3, fiat_p256_felem y3,
                                 fiat_p256_felem z3, const fiat_p256_felem x1,
                                 const fiat_p256_felem y1,
-                                const fiat_p256_felem z1, const int mixed,
+                                const fiat_p256_felem z1,
                                 const fiat_p256_felem x2,
                                 const fiat_p256_felem y2,
                                 const fiat_p256_felem z2) {
-  fiat_p256_felem x_out, y_out, z_out;
-  fiat_p256_limb_t z1nz = fiat_p256_nz(z1);
-  fiat_p256_limb_t z2nz = fiat_p256_nz(z2);
-
-  // z1z1 = z1z1 = z1**2
-  fiat_p256_felem z1z1;
-  fiat_p256_square(z1z1, z1);
-
-  fiat_p256_felem u1, s1, two_z1z2;
-  if (!mixed) {
-    // z2z2 = z2**2
-    fiat_p256_felem z2z2;
-    fiat_p256_square(z2z2, z2);
-
-    // u1 = x1*z2z2
-    fiat_p256_mul(u1, x1, z2z2);
-
-    // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2
-    fiat_p256_add(two_z1z2, z1, z2);
-    fiat_p256_square(two_z1z2, two_z1z2);
-    fiat_p256_sub(two_z1z2, two_z1z2, z1z1);
-    fiat_p256_sub(two_z1z2, two_z1z2, z2z2);
-
-    // s1 = y1 * z2**3
-    fiat_p256_mul(s1, z2, z2z2);
-    fiat_p256_mul(s1, s1, y1);
-  } else {
-    // We'll assume z2 = 1 (special case z2 = 0 is handled later).
-
-    // u1 = x1*z2z2
-    fiat_p256_copy(u1, x1);
-    // two_z1z2 = 2z1z2
-    fiat_p256_add(two_z1z2, z1, z1);
-    // s1 = y1 * z2**3
-    fiat_p256_copy(s1, y1);
-  }
-
-  // u2 = x2*z1z1
-  fiat_p256_felem u2;
-  fiat_p256_mul(u2, x2, z1z1);
-
-  // h = u2 - u1
-  fiat_p256_felem h;
-  fiat_p256_sub(h, u2, u1);
-
-  fiat_p256_limb_t xneq = fiat_p256_nz(h);
-
-  // z_out = two_z1z2 * h
-  fiat_p256_mul(z_out, h, two_z1z2);
-
-  // z1z1z1 = z1 * z1z1
-  fiat_p256_felem z1z1z1;
-  fiat_p256_mul(z1z1z1, z1, z1z1);
-
-  // s2 = y2 * z1**3
-  fiat_p256_felem s2;
-  fiat_p256_mul(s2, y2, z1z1z1);
-
-  // r = (s2 - s1)*2
-  fiat_p256_felem r;
-  fiat_p256_sub(r, s2, s1);
-  fiat_p256_add(r, r, r);
-
-  fiat_p256_limb_t yneq = fiat_p256_nz(r);
-
-  fiat_p256_limb_t is_nontrivial_double = constant_time_is_zero_w(xneq | yneq) &
-                                          ~constant_time_is_zero_w(z1nz) &
-                                          ~constant_time_is_zero_w(z2nz);
-  if (constant_time_declassify_w(is_nontrivial_double)) {
-    fiat_p256_point_double(x3, y3, z3, x1, y1, z1);
-    return;
-  }
-
-  // I = (2h)**2
-  fiat_p256_felem i;
-  fiat_p256_add(i, h, h);
-  fiat_p256_square(i, i);
-
-  // J = h * I
-  fiat_p256_felem j;
-  fiat_p256_mul(j, h, i);
-
-  // V = U1 * I
-  fiat_p256_felem v;
-  fiat_p256_mul(v, u1, i);
-
-  // x_out = r**2 - J - 2V
-  fiat_p256_square(x_out, r);
-  fiat_p256_sub(x_out, x_out, j);
-  fiat_p256_sub(x_out, x_out, v);
-  fiat_p256_sub(x_out, x_out, v);
-
-  // y_out = r(V-x_out) - 2 * s1 * J
-  fiat_p256_sub(y_out, v, x_out);
-  fiat_p256_mul(y_out, y_out, r);
-  fiat_p256_felem s1j;
-  fiat_p256_mul(s1j, s1, j);
-  fiat_p256_sub(y_out, y_out, s1j);
-  fiat_p256_sub(y_out, y_out, s1j);
-
-  fiat_p256_cmovznz(x_out, z1nz, x2, x_out);
-  fiat_p256_cmovznz(x3, z2nz, x1, x_out);
-  fiat_p256_cmovznz(y_out, z1nz, y2, y_out);
-  fiat_p256_cmovznz(y3, z2nz, y1, y_out);
-  fiat_p256_cmovznz(z_out, z1nz, z2, z_out);
-  fiat_p256_cmovznz(z3, z2nz, z1, z_out);
+  uint8_t out[3 * 32], in1[3 * 32], in2[3 * 32];
+  static_assert(sizeof(fiat_p256_felem) == 32);
+  OPENSSL_memcpy(&in1[0], x1, 32);
+  OPENSSL_memcpy(&in1[32], y1, 32);
+  OPENSSL_memcpy(&in1[64], z1, 32);
+  OPENSSL_memcpy(&in2[0], x2, 32);
+  OPENSSL_memcpy(&in2[32], y2, 32);
+  OPENSSL_memcpy(&in2[64], z2, 32);
+  p256_point_add_vartime_if_doubling((br_word_t)out, (br_word_t)in1,
+                                     (br_word_t)in2);
+  OPENSSL_memcpy(x3, &out[0], 32);
+  OPENSSL_memcpy(y3, &out[32], 32);
+  OPENSSL_memcpy(z3, &out[64], 32);
 }
-
 #include "./p256_table.h"
 
 // fiat_p256_select_point_affine selects the |idx-1|th point from a
@@ -454,8 +287,7 @@
   fiat_p256_from_generic(x2, &b->X);
   fiat_p256_from_generic(y2, &b->Y);
   fiat_p256_from_generic(z2, &b->Z);
-  fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 0 /* both Jacobian */, x2, y2,
-                      z2);
+  fiat_p256_point_add(x1, y1, z1, x1, y1, z1, x2, y2, z2);
   fiat_p256_to_generic(&r->X, x1);
   fiat_p256_to_generic(&r->Y, y1);
   fiat_p256_to_generic(&r->Z, z1);
@@ -486,7 +318,7 @@
     if (j & 1) {
       fiat_p256_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2],
                           p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2],
-                          0, p_pre_comp[j - 1][0], p_pre_comp[j - 1][1],
+                          p_pre_comp[j - 1][0], p_pre_comp[j - 1][1],
                           p_pre_comp[j - 1][2]);
     } else {
       fiat_p256_point_double(p_pre_comp[j][0], p_pre_comp[j][1],
@@ -524,8 +356,8 @@
       fiat_p256_cmovznz(tmp[1], (fiat_p256_limb_t)sign, tmp[1], ftmp);
 
       if (!skip) {
-        fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2],
-                            0 /* mixed */, tmp[0], tmp[1], tmp[2]);
+        fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], tmp[0],
+                            tmp[1], tmp[2]);
       } else {
         fiat_p256_copy(nq[0], tmp[0]);
         fiat_p256_copy(nq[1], tmp[1]);
@@ -562,8 +394,8 @@
                                   fiat_p256_g_pre_comp[1], tmp);
 
     if (!skip) {
-      fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2],
-                          1 /* mixed */, tmp[0], tmp[1], tmp[2]);
+      fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], tmp[0],
+                          tmp[1], tmp[2]);
     } else {
       fiat_p256_copy(nq[0], tmp[0]);
       fiat_p256_copy(nq[1], tmp[1]);
@@ -579,8 +411,8 @@
     // Select the point to add, in constant time.
     fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15,
                                   fiat_p256_g_pre_comp[0], tmp);
-    fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
-                        tmp[0], tmp[1], tmp[2]);
+    fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], tmp[0],
+                        tmp[1], tmp[2]);
   }
 
   fiat_p256_to_generic(&r->X, nq[0]);
@@ -605,8 +437,7 @@
   for (size_t i = 1; i < OPENSSL_ARRAY_SIZE(p_pre_comp); i++) {
     fiat_p256_point_add(p_pre_comp[i][0], p_pre_comp[i][1], p_pre_comp[i][2],
                         p_pre_comp[i - 1][0], p_pre_comp[i - 1][1],
-                        p_pre_comp[i - 1][2], 0 /* not mixed */, p2[0], p2[1],
-                        p2[2]);
+                        p_pre_comp[i - 1][2], p2[0], p2[1], p2[2]);
   }
 
   // Set up the coefficients for |p_scalar|.
@@ -632,9 +463,8 @@
       if (bits != 0) {
         size_t index = (size_t)(bits - 1);
         fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
-                            1 /* mixed */, fiat_p256_g_pre_comp[1][index][0],
-                            fiat_p256_g_pre_comp[1][index][1],
-                            fiat_p256_one);
+                            fiat_p256_g_pre_comp[1][index][0],
+                            fiat_p256_g_pre_comp[1][index][1], fiat_p256_one);
         skip = 0;
       }
 
@@ -646,9 +476,8 @@
       if (bits != 0) {
         size_t index = (size_t)(bits - 1);
         fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
-                            1 /* mixed */, fiat_p256_g_pre_comp[0][index][0],
-                            fiat_p256_g_pre_comp[0][index][1],
-                            fiat_p256_one);
+                            fiat_p256_g_pre_comp[0][index][0],
+                            fiat_p256_g_pre_comp[0][index][1], fiat_p256_one);
         skip = 0;
       }
     }
@@ -664,8 +493,7 @@
       }
       if (!skip) {
         fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
-                            0 /* not mixed */, p_pre_comp[idx][0], *y,
-                            p_pre_comp[idx][2]);
+                            p_pre_comp[idx][0], *y, p_pre_comp[idx][2]);
       } else {
         fiat_p256_copy(ret[0], p_pre_comp[idx][0]);
         fiat_p256_copy(ret[1], *y);
diff --git a/crypto/fipsmodule/ec/p256_test.cc b/crypto/fipsmodule/ec/p256_test.cc
index 88dc81e..3ebe854 100644
--- a/crypto/fipsmodule/ec/p256_test.cc
+++ b/crypto/fipsmodule/ec/p256_test.cc
@@ -19,7 +19,7 @@
 #if !defined(OPENSSL_NO_ASM) && defined(__GNUC__) && defined(__x86_64__) && \
     defined(SUPPORTS_ABI_TEST)
 extern "C" {
-#include "../../../third_party/fiat/p256_64.h"
+#include "../../../third_party/fiat/p256_field.c.inc"
 }
 
 TEST(P256Test, AdxMulABI) {
diff --git a/gen/sources.bzl b/gen/sources.bzl
index 9f976f0..57bd217 100644
--- a/gen/sources.bzl
+++ b/gen/sources.bzl
@@ -670,7 +670,10 @@
     "third_party/fiat/p256_32.h",
     "third_party/fiat/p256_64.h",
     "third_party/fiat/p256_64_msvc.h",
-    "third_party/fiat/p256_bedrock.c.inc",
+    "third_party/fiat/p256_field.c.inc",
+    "third_party/fiat/p256_field_32.br.c.inc",
+    "third_party/fiat/p256_field_64.br.c.inc",
+    "third_party/fiat/p256_point.br.c.inc",
 ]
 
 crypto_sources_asm = [
diff --git a/gen/sources.cmake b/gen/sources.cmake
index 978ada9..5adf810 100644
--- a/gen/sources.cmake
+++ b/gen/sources.cmake
@@ -688,7 +688,10 @@
   third_party/fiat/p256_32.h
   third_party/fiat/p256_64.h
   third_party/fiat/p256_64_msvc.h
-  third_party/fiat/p256_bedrock.c.inc
+  third_party/fiat/p256_field.c.inc
+  third_party/fiat/p256_field_32.br.c.inc
+  third_party/fiat/p256_field_64.br.c.inc
+  third_party/fiat/p256_point.br.c.inc
 )
 
 set(
diff --git a/gen/sources.gni b/gen/sources.gni
index 4a7947c..37761d8 100644
--- a/gen/sources.gni
+++ b/gen/sources.gni
@@ -670,7 +670,10 @@
   "third_party/fiat/p256_32.h",
   "third_party/fiat/p256_64.h",
   "third_party/fiat/p256_64_msvc.h",
-  "third_party/fiat/p256_bedrock.c.inc",
+  "third_party/fiat/p256_field.c.inc",
+  "third_party/fiat/p256_field_32.br.c.inc",
+  "third_party/fiat/p256_field_64.br.c.inc",
+  "third_party/fiat/p256_point.br.c.inc",
 ]
 
 crypto_sources_asm = [
diff --git a/gen/sources.json b/gen/sources.json
index 6284d04..da10b20 100644
--- a/gen/sources.json
+++ b/gen/sources.json
@@ -652,7 +652,10 @@
       "third_party/fiat/p256_32.h",
       "third_party/fiat/p256_64.h",
       "third_party/fiat/p256_64_msvc.h",
-      "third_party/fiat/p256_bedrock.c.inc"
+      "third_party/fiat/p256_field.c.inc",
+      "third_party/fiat/p256_field_32.br.c.inc",
+      "third_party/fiat/p256_field_64.br.c.inc",
+      "third_party/fiat/p256_point.br.c.inc"
     ],
     "asm": [
       "crypto/curve25519/asm/x25519-asm-arm.S",
diff --git a/gen/sources.mk b/gen/sources.mk
index 944b9dc..2d9339c 100644
--- a/gen/sources.mk
+++ b/gen/sources.mk
@@ -662,7 +662,10 @@
   third_party/fiat/p256_32.h \
   third_party/fiat/p256_64.h \
   third_party/fiat/p256_64_msvc.h \
-  third_party/fiat/p256_bedrock.c.inc
+  third_party/fiat/p256_field.c.inc \
+  third_party/fiat/p256_field_32.br.c.inc \
+  third_party/fiat/p256_field_64.br.c.inc \
+  third_party/fiat/p256_point.br.c.inc
 
 boringssl_crypto_sources_asm := \
   crypto/curve25519/asm/x25519-asm-arm.S \
diff --git a/third_party/fiat/README.md b/third_party/fiat/README.md
index 4595ba6..3f98155 100644
--- a/third_party/fiat/README.md
+++ b/third_party/fiat/README.md
@@ -29,7 +29,9 @@
 `bedrock_unverified_platform.c.inc` provides the platform functionality code
 thus translated expects.
 
-The P256 coordinate subtraction function in 64-bit C code generated by Fiat
-Cryptography has been manually replaced with a version written in Bedrock.
-This is an early step towards using Bedrock more widely, both on its own and as
-an alternative pathway from Fiat Cryptography to C.
+The P256 point doubling and addition are implemented in this manner in
+p256_coord.br.c.inc. This code originates from Fiat Cryptogaphy commit
+b8890d810b812180fb224356e0f0b932afade414 file src/Bedrock/P256.v
+Field arithmetic operations are dispatched in p256_field.c.inc to
+platform-specific code including assembly, fiat-crypto-generated C, and
+Bedrock-generated C.
diff --git a/third_party/fiat/bedrock_unverified_platform.c.inc b/third_party/fiat/bedrock_unverified_platform.c.inc
index fbfe90f..89fe7d0 100644
--- a/third_party/fiat/bedrock_unverified_platform.c.inc
+++ b/third_party/fiat/bedrock_unverified_platform.c.inc
@@ -34,6 +34,10 @@
   OPENSSL_memset((void *)d, v, n);
 }
 
+static inline void br_memcxor(uintptr_t d, uintptr_t s, uintptr_t n, uintptr_t mask) {
+  constant_time_conditional_memxor((void*)d, (void*)s, n, mask);
+}
+
 // CPU Arithmetic
 
 static inline br_word_t br_full_add(br_word_t x, br_word_t y, br_word_t carry,
diff --git a/third_party/fiat/p256_64.h b/third_party/fiat/p256_64.h
index 21f7a98..437066a 100644
--- a/third_party/fiat/p256_64.h
+++ b/third_party/fiat/p256_64.h
@@ -1,6 +1,6 @@
 #include <openssl/base.h>
 #include "bedrock_unverified_platform.c.inc"
-#include "p256_bedrock.c.inc"
+#include "p256_field_64.br.c.inc"
 #include "../../crypto/internal.h"
 
 #if !defined(OPENSSL_NO_ASM) && defined(__GNUC__) && defined(__x86_64__)
@@ -177,6 +177,7 @@
  *
  */
 static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) {
+  // NOTE: edited by hand, see third_party/fiat/README.md
 #if !defined(OPENSSL_NO_ASM) && defined(__GNUC__) && defined(__x86_64__)
   if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
     CRYPTO_is_ADX_capable()) {
@@ -491,6 +492,7 @@
  *
  */
 static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) {
+  // NOTE: edited by hand, see third_party/fiat/README.md
 #if !defined(OPENSSL_NO_ASM) && defined(__GNUC__) && defined(__x86_64__)
   if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
     CRYPTO_is_ADX_capable()) {
@@ -806,45 +808,8 @@
  *
  */
 static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) {
-  uint64_t x1;
-  fiat_p256_uint1 x2;
-  uint64_t x3;
-  fiat_p256_uint1 x4;
-  uint64_t x5;
-  fiat_p256_uint1 x6;
-  uint64_t x7;
-  fiat_p256_uint1 x8;
-  uint64_t x9;
-  fiat_p256_uint1 x10;
-  uint64_t x11;
-  fiat_p256_uint1 x12;
-  uint64_t x13;
-  fiat_p256_uint1 x14;
-  uint64_t x15;
-  fiat_p256_uint1 x16;
-  uint64_t x17;
-  fiat_p256_uint1 x18;
-  uint64_t x19;
-  uint64_t x20;
-  uint64_t x21;
-  uint64_t x22;
-  fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0]));
-  fiat_p256_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1]));
-  fiat_p256_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2]));
-  fiat_p256_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3]));
-  fiat_p256_subborrowx_u64(&x9, &x10, 0x0, x1, UINT64_C(0xffffffffffffffff));
-  fiat_p256_subborrowx_u64(&x11, &x12, x10, x3, UINT32_C(0xffffffff));
-  fiat_p256_subborrowx_u64(&x13, &x14, x12, x5, 0x0);
-  fiat_p256_subborrowx_u64(&x15, &x16, x14, x7, UINT64_C(0xffffffff00000001));
-  fiat_p256_subborrowx_u64(&x17, &x18, x16, x8, 0x0);
-  fiat_p256_cmovznz_u64(&x19, x18, x9, x1);
-  fiat_p256_cmovznz_u64(&x20, x18, x11, x3);
-  fiat_p256_cmovznz_u64(&x21, x18, x13, x5);
-  fiat_p256_cmovznz_u64(&x22, x18, x15, x7);
-  out1[0] = x19;
-  out1[1] = x20;
-  out1[2] = x21;
-  out1[3] = x22;
+  // NOTE: edited by hand, see third_party/fiat/README.md
+  p256_coord_add((br_word_t)out1, (br_word_t)arg1, (br_word_t)arg2);
 }
 
 /*
@@ -859,6 +824,7 @@
  *
  */
 static FIAT_P256_FIAT_INLINE void fiat_p256_sub(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) {
+  // NOTE: edited by hand, see third_party/fiat/README.md
   p256_coord_sub((br_word_t)out1, (br_word_t)arg1, (br_word_t)arg2);
 }
 
diff --git a/third_party/fiat/p256_64_msvc.h b/third_party/fiat/p256_64_msvc.h
index 0e56867..aff43d3 100644
--- a/third_party/fiat/p256_64_msvc.h
+++ b/third_party/fiat/p256_64_msvc.h
@@ -18,7 +18,7 @@
 /*                            if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 */
 
 #include "bedrock_unverified_platform.c.inc"
-#include "p256_bedrock.c.inc"
+#include "p256_field_64.br.c.inc"
 #include <stdint.h>
 #include <intrin.h>
 #if defined(_M_X64)
@@ -773,45 +773,8 @@
  *
  */
 static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) {
-  uint64_t x1;
-  fiat_p256_uint1 x2;
-  uint64_t x3;
-  fiat_p256_uint1 x4;
-  uint64_t x5;
-  fiat_p256_uint1 x6;
-  uint64_t x7;
-  fiat_p256_uint1 x8;
-  uint64_t x9;
-  fiat_p256_uint1 x10;
-  uint64_t x11;
-  fiat_p256_uint1 x12;
-  uint64_t x13;
-  fiat_p256_uint1 x14;
-  uint64_t x15;
-  fiat_p256_uint1 x16;
-  uint64_t x17;
-  fiat_p256_uint1 x18;
-  uint64_t x19;
-  uint64_t x20;
-  uint64_t x21;
-  uint64_t x22;
-  fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0]));
-  fiat_p256_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1]));
-  fiat_p256_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2]));
-  fiat_p256_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3]));
-  fiat_p256_subborrowx_u64(&x9, &x10, 0x0, x1, UINT64_C(0xffffffffffffffff));
-  fiat_p256_subborrowx_u64(&x11, &x12, x10, x3, UINT32_C(0xffffffff));
-  fiat_p256_subborrowx_u64(&x13, &x14, x12, x5, 0x0);
-  fiat_p256_subborrowx_u64(&x15, &x16, x14, x7, UINT64_C(0xffffffff00000001));
-  fiat_p256_subborrowx_u64(&x17, &x18, x16, x8, 0x0);
-  fiat_p256_cmovznz_u64(&x19, x18, x9, x1);
-  fiat_p256_cmovznz_u64(&x20, x18, x11, x3);
-  fiat_p256_cmovznz_u64(&x21, x18, x13, x5);
-  fiat_p256_cmovznz_u64(&x22, x18, x15, x7);
-  out1[0] = x19;
-  out1[1] = x20;
-  out1[2] = x21;
-  out1[3] = x22;
+  // NOTE: edited by hand, see third_party/fiat/README.md
+  p256_coord_add((br_word_t)out1, (br_word_t)arg1, (br_word_t)arg2);
 }
 
 /*
diff --git a/third_party/fiat/p256_bedrock.c.inc b/third_party/fiat/p256_bedrock.c.inc
deleted file mode 100644
index fb8c3de..0000000
--- a/third_party/fiat/p256_bedrock.c.inc
+++ /dev/null
@@ -1,21 +0,0 @@
-// Generated from Bedrock code in Fiat Cryptography. Avoid editing directly.
-
-static_assert(BR_WORD_MAX == UINT64_MAX, "");
-
-static inline void p256_coord_sub(br_word_t out, br_word_t x, br_word_t y) {
-  br_word_t borrow, t0, t1, t2, t3, mask, carry, r0, r1, r2, r3;
-  borrow = br_full_sub(_br_load(x), _br_load(y), (br_word_t)0, &t0);
-  borrow = br_full_sub(_br_load(x+8), _br_load(y+8), borrow, &t1);
-  borrow = br_full_sub(_br_load((x+8)+8), _br_load((y+8)+8), borrow, &t2);
-  borrow = br_full_sub(_br_load(((x+8)+8)+8), _br_load(((y+8)+8)+8), borrow, &t3);
-  mask = br_value_barrier(0u-borrow);
-  carry = br_full_add(t0, mask, (br_word_t)0, &r0);
-  carry = br_full_add(t1, mask&0xffffffff, carry, &r1);
-  carry = br_full_add(t2, (br_word_t)0, carry, &r2);
-  carry = br_full_add(t3, mask&0xffffffff00000001, carry, &r3);
-  _br_store(out, r0);
-  _br_store(out+8, r1);
-  _br_store((out+8)+8, r2);
-  _br_store(((out+8)+8)+8, r3);
-  return;
-}
diff --git a/third_party/fiat/p256_field.c.inc b/third_party/fiat/p256_field.c.inc
new file mode 100644
index 0000000..bbd25de
--- /dev/null
+++ b/third_party/fiat/p256_field.c.inc
@@ -0,0 +1,26 @@
+#include <openssl/base.h>
+#include "bedrock_unverified_platform.c.inc"
+
+#if defined(BORINGSSL_HAS_UINT128)
+#include "p256_64.h"
+#elif defined(OPENSSL_64_BIT)
+#include "p256_64_msvc.h"
+#else
+#include "p256_field_32.br.c.inc"
+#include "p256_32.h"
+// the 32-bit Bedrock-generated field halving calls Fiat-C code for add, sub
+static inline void p256_coord_add(br_word_t out, br_word_t x, br_word_t y) {
+  fiat_p256_add((uint32_t*)out, (const uint32_t*)x, (const uint32_t*)y);
+}
+static inline void p256_coord_sub(br_word_t out, br_word_t x, br_word_t y) {
+  fiat_p256_sub((uint32_t*)out, (const uint32_t*)x, (const uint32_t*)y);
+}
+#endif
+
+// the Bedrock-generated point operations call Fiat-C or Fiat-x86 mul, sqr
+static inline void p256_coord_mul(br_word_t out, br_word_t x, br_word_t y) {
+  fiat_p256_mul((br_word_t*)out, (const br_word_t*)x, (const br_word_t*)y);
+}
+static inline void p256_coord_sqr(br_word_t out, br_word_t x) {
+  fiat_p256_square((br_word_t*)out, (const br_word_t*)x);
+}
diff --git a/third_party/fiat/p256_field_32.br.c.inc b/third_party/fiat/p256_field_32.br.c.inc
new file mode 100644
index 0000000..4fe7c1a
--- /dev/null
+++ b/third_party/fiat/p256_field_32.br.c.inc
@@ -0,0 +1,73 @@
+// Generated from Bedrock code in Fiat Cryptography. Avoid editing directly.
+
+static inline br_word_t shrd(br_word_t lo, br_word_t hi, br_word_t n) {
+  br_word_t res;
+  res = lo>>(n&(sizeof(br_word_t)*8-1));
+  if (n) {
+    res = (hi<<((((((0u-(br_word_t)1)>>27)&63)+1)-n)&(sizeof(br_word_t)*8-1)))|res;
+  } else {
+    /*skip*/
+  }
+  return res;
+}
+
+static inline br_word_t p256_coord_nonzero(br_word_t p_x) {
+  br_word_t nz;
+  nz = (((_br_load(p_x))|(_br_load(p_x+4)))|(_br_load((p_x+4)+4)))|(_br_load(((p_x+4)+4)+4));
+  nz = nz|(_br_load((((p_x+4)+4)+4)+4));
+  nz = nz|(_br_load(((((p_x+4)+4)+4)+4)+4));
+  nz = nz|(_br_load((((((p_x+4)+4)+4)+4)+4)+4));
+  nz = nz|(_br_load(((((((p_x+4)+4)+4)+4)+4)+4)+4));
+  nz = br_broadcast_nonzero(nz);
+  return nz;
+}
+
+static inline void u256_shr(br_word_t p_out, br_word_t p_x, br_word_t n) {
+  br_word_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
+  x0 = _br_load(p_x);
+  x1 = _br_load(p_x+4);
+  x2 = _br_load((p_x+4)+4);
+  x3 = _br_load(((p_x+4)+4)+4);
+  x4 = _br_load((((p_x+4)+4)+4)+4);
+  x5 = _br_load(((((p_x+4)+4)+4)+4)+4);
+  x6 = _br_load((((((p_x+4)+4)+4)+4)+4)+4);
+  x7 = _br_load(((((((p_x+4)+4)+4)+4)+4)+4)+4);
+  y0 = shrd(x0, x1, n);
+  y1 = shrd(x1, x2, n);
+  y2 = shrd(x2, x3, n);
+  y3 = shrd(x3, x4, n);
+  y4 = shrd(x4, x5, n);
+  y5 = shrd(x5, x6, n);
+  y6 = shrd(x6, x7, n);
+  y7 = x7>>(n&(sizeof(br_word_t)*8-1));
+  _br_store(p_out, y0);
+  _br_store(p_out+4, y1);
+  _br_store((p_out+4)+4, y2);
+  _br_store(((p_out+4)+4)+4, y3);
+  _br_store((((p_out+4)+4)+4)+4, y4);
+  _br_store(((((p_out+4)+4)+4)+4)+4, y5);
+  _br_store((((((p_out+4)+4)+4)+4)+4)+4, y6);
+  _br_store(((((((p_out+4)+4)+4)+4)+4)+4)+4, y7);
+  return;
+}
+
+static inline void u256_set_p256_minushalf_conditional(br_word_t p_out, br_word_t mask) {
+  br_word_t mh0, mh1, mh2, mh3, mh4, mh5, mh6, mh7;
+  mh0 = 0u-(br_word_t)1;
+  mh1 = mh0;
+  mh2 = mh0>>1;
+  mh3 = (br_word_t)0;
+  mh4 = (br_word_t)0;
+  mh5 = (br_word_t)1<<31;
+  mh6 = mh5;
+  mh7 = mh2;
+  _br_store(p_out, mask&mh0);
+  _br_store(p_out+4, mask&mh1);
+  _br_store((p_out+4)+4, mask&mh2);
+  _br_store(((p_out+4)+4)+4, mask&mh3);
+  _br_store((((p_out+4)+4)+4)+4, mask&mh4);
+  _br_store(((((p_out+4)+4)+4)+4)+4, mask&mh5);
+  _br_store((((((p_out+4)+4)+4)+4)+4)+4, mask&mh6);
+  _br_store(((((((p_out+4)+4)+4)+4)+4)+4)+4, mask&mh7);
+  return;
+}
diff --git a/third_party/fiat/p256_field_64.br.c.inc b/third_party/fiat/p256_field_64.br.c.inc
new file mode 100644
index 0000000..756d237
--- /dev/null
+++ b/third_party/fiat/p256_field_64.br.c.inc
@@ -0,0 +1,88 @@
+// Generated from Bedrock code in Fiat Cryptography. Avoid editing directly.
+
+static inline br_word_t shrd(br_word_t lo, br_word_t hi, br_word_t n) {
+  br_word_t res;
+  res = lo>>(n&(sizeof(br_word_t)*8-1));
+  if (n) {
+    res = (hi<<((((((0u-(br_word_t)1)>>27)&63)+1)-n)&(sizeof(br_word_t)*8-1)))|res;
+  } else {
+    /*skip*/
+  }
+  return res;
+}
+
+static inline void p256_coord_add(br_word_t p_out, br_word_t p_x, br_word_t p_y) {
+  br_word_t r4, carry, t0, t1, t2, borrow, t3, r0, r1, r2, r3;
+  carry = br_full_add(_br_load(p_x), _br_load(p_y), (br_word_t)0, &t0);
+  carry = br_full_add(_br_load(p_x+8), _br_load(p_y+8), carry, &t1);
+  carry = br_full_add(_br_load((p_x+8)+8), _br_load((p_y+8)+8), carry, &t2);
+  carry = br_full_add(_br_load(((p_x+8)+8)+8), _br_load(((p_y+8)+8)+8), carry, &t3);
+  borrow = br_full_sub(t0, (br_word_t)0xffffffffffffffff, (br_word_t)0, &r0);
+  borrow = br_full_sub(t1, (br_word_t)0xffffffff, borrow, &r1);
+  borrow = br_full_sub(t2, (br_word_t)0, borrow, &r2);
+  borrow = br_full_sub(t3, (br_word_t)0xffffffff00000001, borrow, &r3);
+  borrow = br_full_sub(carry, (br_word_t)0, borrow, &r4);
+  r0 = br_cmov(borrow, t0, r0);
+  r1 = br_cmov(borrow, t1, r1);
+  r2 = br_cmov(borrow, t2, r2);
+  r3 = br_cmov(borrow, t3, r3);
+  _br_store(p_out, r0);
+  _br_store(p_out+8, r1);
+  _br_store((p_out+8)+8, r2);
+  _br_store(((p_out+8)+8)+8, r3);
+  return;
+}
+
+static inline void p256_coord_sub(br_word_t out, br_word_t x, br_word_t y) {
+  br_word_t borrow, t0, t1, t2, t3, mask, carry, r0, r1, r2, r3;
+  borrow = br_full_sub(_br_load(x), _br_load(y), (br_word_t)0, &t0);
+  borrow = br_full_sub(_br_load(x+8), _br_load(y+8), borrow, &t1);
+  borrow = br_full_sub(_br_load((x+8)+8), _br_load((y+8)+8), borrow, &t2);
+  borrow = br_full_sub(_br_load(((x+8)+8)+8), _br_load(((y+8)+8)+8), borrow, &t3);
+  mask = br_value_barrier(0u-borrow);
+  carry = br_full_add(t0, mask, (br_word_t)0, &r0);
+  carry = br_full_add(t1, mask&0xffffffff, carry, &r1);
+  carry = br_full_add(t2, (br_word_t)0, carry, &r2);
+  carry = br_full_add(t3, mask&0xffffffff00000001, carry, &r3);
+  _br_store(out, r0);
+  _br_store(out+8, r1);
+  _br_store((out+8)+8, r2);
+  _br_store(((out+8)+8)+8, r3);
+  return;
+}
+
+static inline br_word_t p256_coord_nonzero(br_word_t p_x) {
+  br_word_t nz;
+  nz = br_broadcast_nonzero((((_br_load(p_x))|(_br_load(p_x+8)))|(_br_load((p_x+8)+8)))|(_br_load(((p_x+8)+8)+8)));
+  return nz;
+}
+
+static inline void u256_shr(br_word_t p_out, br_word_t p_x, br_word_t n) {
+  br_word_t x0, x1, x2, x3, y0, y1, y2, y3;
+  x0 = _br_load(p_x);
+  x1 = _br_load(p_x+8);
+  x2 = _br_load((p_x+8)+8);
+  x3 = _br_load(((p_x+8)+8)+8);
+  y0 = shrd(x0, x1, n);
+  y1 = shrd(x1, x2, n);
+  y2 = shrd(x2, x3, n);
+  y3 = x3>>(n&(sizeof(br_word_t)*8-1));
+  _br_store(p_out, y0);
+  _br_store(p_out+8, y1);
+  _br_store((p_out+8)+8, y2);
+  _br_store(((p_out+8)+8)+8, y3);
+  return;
+}
+
+static inline void u256_set_p256_minushalf_conditional(br_word_t p_out, br_word_t mask) {
+  br_word_t mh0, mh1, mh2, mh3;
+  mh0 = 0u-(br_word_t)1;
+  mh1 = mh0>>((br_word_t)33&(sizeof(br_word_t)*8-1));
+  mh2 = mh0<<((br_word_t)63&(sizeof(br_word_t)*8-1));
+  mh3 = (mh0<<((br_word_t)32&(sizeof(br_word_t)*8-1)))>>1;
+  _br_store(p_out, mask&mh0);
+  _br_store(p_out+8, mask&mh1);
+  _br_store((p_out+8)+8, mask&mh2);
+  _br_store(((p_out+8)+8)+8, mask&mh3);
+  return;
+}
diff --git a/third_party/fiat/p256_point.br.c.inc b/third_party/fiat/p256_point.br.c.inc
new file mode 100644
index 0000000..84c873a
--- /dev/null
+++ b/third_party/fiat/p256_point.br.c.inc
@@ -0,0 +1,115 @@
+// Generated from Bedrock code in Fiat Cryptography. Avoid editing directly.
+
+static inline br_word_t br_broadcast_odd(br_word_t x) {
+  br_word_t y;
+  x = br_value_barrier(x&1);
+  y = 0u-x;
+  return y;
+}
+
+static inline void p256_coord_halve(br_word_t y, br_word_t x) {
+  br_word_t m, mmh;
+  uint8_t _br_stackalloc_mmh[32] = {0}; mmh = (br_word_t)&_br_stackalloc_mmh;
+  m = br_broadcast_odd(_br_load(x));
+  u256_set_p256_minushalf_conditional(mmh, m);
+  u256_shr(y, x, (br_word_t)1);
+  p256_coord_sub(y, y, mmh);
+  return;
+}
+
+static inline br_word_t p256_point_iszero(br_word_t p_P) {
+  br_word_t z, nz;
+  nz = p256_coord_nonzero((p_P+32)+32);
+  z = ~nz;
+  return z;
+}
+
+static inline void p256_point_double(br_word_t out, br_word_t in1) {
+  br_word_t t2, tmp, A, D;
+  uint8_t _br_stackalloc_D[32] = {0}; D = (br_word_t)&_br_stackalloc_D;
+  uint8_t _br_stackalloc_A[32] = {0}; A = (br_word_t)&_br_stackalloc_A;
+  uint8_t _br_stackalloc_tmp[32] = {0}; tmp = (br_word_t)&_br_stackalloc_tmp;
+  p256_coord_add(D, in1+32, in1+32);
+  p256_coord_sqr(tmp, (in1+32)+32);
+  p256_coord_sqr(D, D);
+  p256_coord_mul((out+32)+32, (in1+32)+32, in1+32);
+  p256_coord_add((out+32)+32, (out+32)+32, (out+32)+32);
+  p256_coord_add(A, in1, tmp);
+  p256_coord_sub(tmp, in1, tmp);
+  uint8_t _br_stackalloc_t2[32] = {0}; t2 = (br_word_t)&_br_stackalloc_t2;
+  p256_coord_add(t2, tmp, tmp);
+  p256_coord_add(tmp, t2, tmp);
+  p256_coord_sqr(out+32, D);
+  p256_coord_mul(A, A, tmp);
+  p256_coord_mul(D, D, in1);
+  p256_coord_sqr(out, A);
+  p256_coord_add(tmp, D, D);
+  p256_coord_sub(out, out, tmp);
+  p256_coord_sub(D, D, out);
+  p256_coord_mul(D, D, A);
+  p256_coord_halve(out+32, out+32);
+  p256_coord_sub(out+32, D, out+32);
+  return;
+}
+
+static inline br_word_t p256_point_add_nz_nz_neq(br_word_t p_out, br_word_t p_P, br_word_t p_Q) {
+  br_word_t z1z1, z2z2, u1, Hsqr, ok, different_x, different_y, u2, Hcub, s1, r, h, s2;
+  uint8_t _br_stackalloc_z1z1[32] = {0}; z1z1 = (br_word_t)&_br_stackalloc_z1z1;
+  uint8_t _br_stackalloc_z2z2[32] = {0}; z2z2 = (br_word_t)&_br_stackalloc_z2z2;
+  uint8_t _br_stackalloc_u1[32] = {0}; u1 = (br_word_t)&_br_stackalloc_u1;
+  uint8_t _br_stackalloc_u2[32] = {0}; u2 = (br_word_t)&_br_stackalloc_u2;
+  uint8_t _br_stackalloc_h[32] = {0}; h = (br_word_t)&_br_stackalloc_h;
+  uint8_t _br_stackalloc_s1[32] = {0}; s1 = (br_word_t)&_br_stackalloc_s1;
+  uint8_t _br_stackalloc_s2[32] = {0}; s2 = (br_word_t)&_br_stackalloc_s2;
+  uint8_t _br_stackalloc_r[32] = {0}; r = (br_word_t)&_br_stackalloc_r;
+  uint8_t _br_stackalloc_Hsqr[32] = {0}; Hsqr = (br_word_t)&_br_stackalloc_Hsqr;
+  uint8_t _br_stackalloc_Hcub[32] = {0}; Hcub = (br_word_t)&_br_stackalloc_Hcub;
+  p256_coord_sqr(z1z1, (p_P+32)+32);
+  p256_coord_mul(u2, p_Q, z1z1);
+  p256_coord_sqr(z2z2, (p_Q+32)+32);
+  p256_coord_mul(u1, p_P, z2z2);
+  p256_coord_sub(h, u2, u1);
+  p256_coord_mul(s2, (p_P+32)+32, z1z1);
+  p256_coord_mul((p_out+32)+32, h, (p_P+32)+32);
+  p256_coord_mul((p_out+32)+32, (p_out+32)+32, (p_Q+32)+32);
+  p256_coord_mul(s2, s2, p_Q+32);
+  p256_coord_mul(s1, (p_Q+32)+32, z2z2);
+  p256_coord_mul(s1, s1, p_P+32);
+  p256_coord_sub(r, s2, s1);
+  p256_coord_sqr(Hsqr, h);
+  p256_coord_sqr(p_out, r);
+  p256_coord_mul(Hcub, Hsqr, h);
+  p256_coord_mul(u2, u1, Hsqr);
+  different_x = p256_coord_nonzero(Hcub);
+  different_y = p256_coord_nonzero(p_out);
+  ok = br_value_barrier(different_x|different_y);
+  p256_coord_sub(p_out, p_out, Hcub);
+  p256_coord_sub(p_out, p_out, u2);
+  p256_coord_sub(p_out, p_out, u2);
+  p256_coord_sub(h, u2, p_out);
+  p256_coord_mul(s2, Hcub, s1);
+  p256_coord_mul(h, h, r);
+  p256_coord_sub(p_out+32, h, s2);
+  return ok;
+}
+
+static inline void p256_point_add_vartime_if_doubling(br_word_t p_out, br_word_t p_P, br_word_t p_Q) {
+  br_word_t p_tmp, zeroP, zeroQ, ok, p_sel;
+  zeroP = p256_point_iszero(p_P);
+  zeroQ = p256_point_iszero(p_Q);
+  uint8_t _br_stackalloc_p_tmp[96] = {0}; p_tmp = (br_word_t)&_br_stackalloc_p_tmp;
+  ok = p256_point_add_nz_nz_neq(p_tmp, p_P, p_Q);
+  ok = br_declassify((zeroP|zeroQ)|ok);
+  uint8_t _br_stackalloc_p_sel[96] = {0}; p_sel = (br_word_t)&_br_stackalloc_p_sel;
+  br_memset(p_sel, (br_word_t)0, (br_word_t)96);
+  br_memcxor(p_sel, p_tmp, (br_word_t)96, (~zeroP)&(~zeroQ));
+  br_memcxor(p_sel, p_P, (br_word_t)96, (~zeroP)&zeroQ);
+  br_memcxor(p_sel, p_Q, (br_word_t)96, zeroP&(~zeroQ));
+  if (ok) {
+    /*skip*/
+  } else {
+    p256_point_double(p_sel, p_P);
+  }
+  br_memcpy(p_out, p_sel, (br_word_t)96);
+  return;
+}