// Copyright 2020 The BoringSSL Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// An implementation of the NIST P-256 elliptic curve point multiplication.
// Field elements are in 256-bit Montgomery form, with 64-bit or 32-bit limbs
// depending on the platform. The field operations are generated by Fiat, which
// lives in //third_party/fiat.

#include <openssl/base.h>
#include <openssl/bn.h>
#include <openssl/ec.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include <assert.h>
#include <string.h>
#include <iterator>
#include "../../internal.h"
#include "../delocate.h"
#include "./internal.h"
#include "../../../third_party/fiat/p256_field.c.inc"
#include "../../../third_party/fiat/p256_point.br.c.inc"

// utility functions, handwritten
#if defined(OPENSSL_64_BIT)
#define FIAT_P256_NLIMBS 4
typedef uint64_t fiat_p256_limb_t;
typedef uint64_t fiat_p256_felem[FIAT_P256_NLIMBS];
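// fiat_p256_one is the field element one in Montgomery form, i.e.
// R = 2^256 mod p.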
static const fiat_p256_felem fiat_p256_one = {0x1, 0xffffffff00000000,
0xffffffffffffffff, 0xfffffffe};
#else // 64BIT; else 32BIT
#define FIAT_P256_NLIMBS 8
typedef uint32_t fiat_p256_limb_t;
typedef uint32_t fiat_p256_felem[FIAT_P256_NLIMBS];
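// fiat_p256_one is the field element one in Montgomery form (R = 2^256 mod p),
// as above, but with 32-bit limbs.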
static const fiat_p256_felem fiat_p256_one = {
0x1, 0x0, 0x0, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0x0};
#endif // 64BIT
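
// fiat_p256_copy sets |out| to |in1|.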
static void fiat_p256_copy(fiat_p256_limb_t out[FIAT_P256_NLIMBS],
const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) {
for (size_t i = 0; i < FIAT_P256_NLIMBS; i++) {
out[i] = in1[i];
}
}
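
// fiat_p256_cmovznz sets |out| to |z| if |t| is zero and to |nz| otherwise, in
// constant time.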
static void fiat_p256_cmovznz(fiat_p256_limb_t out[FIAT_P256_NLIMBS],
fiat_p256_limb_t t,
const fiat_p256_limb_t z[FIAT_P256_NLIMBS],
const fiat_p256_limb_t nz[FIAT_P256_NLIMBS]) {
fiat_p256_selectznz(out, !!t, z, nz);
}
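
// fiat_p256_from_words converts a 256-bit value, given as |BN_ULONG| words in
// little-endian order, into the limb representation used by the Fiat code.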
static void fiat_p256_from_words(fiat_p256_felem out,
const BN_ULONG in[32 / sizeof(BN_ULONG)]) {
// Typically, |BN_ULONG| and |fiat_p256_limb_t| will be the same type, but on
// 64-bit platforms without |uint128_t|, they are different. However, on
// little-endian systems, |uint64_t[4]| and |uint32_t[8]| have the same
// layout.
OPENSSL_memcpy(out, in, 32);
}
static void fiat_p256_from_generic(fiat_p256_felem out, const EC_FELEM *in) {
fiat_p256_from_words(out, in->words);
}
static void fiat_p256_to_generic(EC_FELEM *out, const fiat_p256_felem in) {
// See |fiat_p256_from_words|.
OPENSSL_memcpy(out->words, in, 32);
}
// fiat_p256_inv_square calculates |out| = |in|^{-2}
//
// Based on Fermat's Little Theorem:
// a^p = a (mod p)
// a^{p-1} = 1 (mod p)
// a^{p-3} = a^{-2} (mod p)
static void fiat_p256_inv_square(fiat_p256_felem out,
const fiat_p256_felem in) {
// This implements the addition chain described in
// https://briansmith.org/ecc-inversion-addition-chains-01#p256_field_inversion
fiat_p256_felem x2, x3, x6, x12, x15, x30, x32;
fiat_p256_square(x2, in); // 2^2 - 2^1
fiat_p256_mul(x2, x2, in); // 2^2 - 2^0
fiat_p256_square(x3, x2); // 2^3 - 2^1
fiat_p256_mul(x3, x3, in); // 2^3 - 2^0
fiat_p256_square(x6, x3);
for (int i = 1; i < 3; i++) {
fiat_p256_square(x6, x6);
} // 2^6 - 2^3
fiat_p256_mul(x6, x6, x3); // 2^6 - 2^0
fiat_p256_square(x12, x6);
for (int i = 1; i < 6; i++) {
fiat_p256_square(x12, x12);
} // 2^12 - 2^6
fiat_p256_mul(x12, x12, x6); // 2^12 - 2^0
fiat_p256_square(x15, x12);
for (int i = 1; i < 3; i++) {
fiat_p256_square(x15, x15);
} // 2^15 - 2^3
fiat_p256_mul(x15, x15, x3); // 2^15 - 2^0
fiat_p256_square(x30, x15);
for (int i = 1; i < 15; i++) {
fiat_p256_square(x30, x30);
} // 2^30 - 2^15
fiat_p256_mul(x30, x30, x15); // 2^30 - 2^0
fiat_p256_square(x32, x30);
fiat_p256_square(x32, x32); // 2^32 - 2^2
fiat_p256_mul(x32, x32, x2); // 2^32 - 2^0
fiat_p256_felem ret;
fiat_p256_square(ret, x32);
for (int i = 1; i < 32; i++) {
fiat_p256_square(ret, ret);
} // 2^64 - 2^32
fiat_p256_mul(ret, ret, in); // 2^64 - 2^32 + 2^0
for (int i = 0; i < 96 + 32; i++) {
fiat_p256_square(ret, ret);
} // 2^192 - 2^160 + 2^128
fiat_p256_mul(ret, ret, x32); // 2^192 - 2^160 + 2^128 + 2^32 - 2^0
for (int i = 0; i < 32; i++) {
fiat_p256_square(ret, ret);
} // 2^224 - 2^192 + 2^160 + 2^64 - 2^32
fiat_p256_mul(ret, ret, x32); // 2^224 - 2^192 + 2^160 + 2^64 - 2^0
for (int i = 0; i < 30; i++) {
fiat_p256_square(ret, ret);
} // 2^254 - 2^222 + 2^190 + 2^94 - 2^30
fiat_p256_mul(ret, ret, x30); // 2^254 - 2^222 + 2^190 + 2^94 - 2^0
fiat_p256_square(ret, ret);
fiat_p256_square(out, ret); // 2^256 - 2^224 + 2^192 + 2^96 - 2^2
}
// Group operations
// ----------------
//
// Building on top of the field operations we have the operations on the
// elliptic curve group itself. Points on the curve are represented in Jacobian
// coordinates.
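
// fiat_p256_point_double computes 2*(x_in, y_in, z_in) in Jacobian coordinates
// and writes the result to (x_out, y_out, z_out). It wraps the generated
// |p256_point_double|, which takes its output and input as pointers (passed as
// |br_word_t|) to buffers holding the three 32-byte coordinates back to back.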
static void fiat_p256_point_double(fiat_p256_felem x_out, fiat_p256_felem y_out,
fiat_p256_felem z_out,
const fiat_p256_felem x_in,
const fiat_p256_felem y_in,
const fiat_p256_felem z_in) {
uint8_t out[3 * 32], in[3 * 32];
static_assert(sizeof(fiat_p256_felem) == 32);
OPENSSL_memcpy(&in[0], x_in, 32);
OPENSSL_memcpy(&in[32], y_in, 32);
OPENSSL_memcpy(&in[64], z_in, 32);
p256_point_double((br_word_t)out, (br_word_t)in);
OPENSSL_memcpy(x_out, &out[0], 32);
OPENSSL_memcpy(y_out, &out[32], 32);
OPENSSL_memcpy(z_out, &out[64], 32);
}
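
// fiat_p256_point_add sets (x3, y3, z3) to the sum of the Jacobian points
// (x1, y1, z1) and (x2, y2, z2). It wraps the generated
// |p256_point_add_vartime_if_doubling| which, as the name suggests, may take a
// variable-time path when the two inputs represent the same point, but is
// otherwise constant-time in the point values.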
static void fiat_p256_point_add(fiat_p256_felem x3, fiat_p256_felem y3,
fiat_p256_felem z3, const fiat_p256_felem x1,
const fiat_p256_felem y1,
const fiat_p256_felem z1,
const fiat_p256_felem x2,
const fiat_p256_felem y2,
const fiat_p256_felem z2) {
uint8_t out[3 * 32], in1[3 * 32], in2[3 * 32];
static_assert(sizeof(fiat_p256_felem) == 32);
OPENSSL_memcpy(&in1[0], x1, 32);
OPENSSL_memcpy(&in1[32], y1, 32);
OPENSSL_memcpy(&in1[64], z1, 32);
OPENSSL_memcpy(&in2[0], x2, 32);
OPENSSL_memcpy(&in2[32], y2, 32);
OPENSSL_memcpy(&in2[64], z2, 32);
p256_point_add_vartime_if_doubling((br_word_t)out, (br_word_t)in1,
(br_word_t)in2);
OPENSSL_memcpy(x3, &out[0], 32);
OPENSSL_memcpy(y3, &out[32], 32);
OPENSSL_memcpy(z3, &out[64], 32);
}
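
// p256_table.h defines |fiat_p256_g_pre_comp|, a table of precomputed
// multiples of the generator, used by the |mul_base| and |mul_public| paths
// below.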
#include "./p256_table.h"
// fiat_p256_select_point_affine selects the |idx-1|th point from a
// precomputation table and copies it to out. If |idx| is zero, the output is
// the point at infinity.
static void fiat_p256_select_point_affine(
const fiat_p256_limb_t idx, size_t size,
const fiat_p256_felem pre_comp[/*size*/][2], fiat_p256_felem out[3]) {
OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3);
for (size_t i = 0; i < size; i++) {
fiat_p256_limb_t mismatch = i ^ (idx - 1);
fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]);
fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]);
}
fiat_p256_cmovznz(out[2], idx, out[2], fiat_p256_one);
}
// fiat_p256_select_point selects the |idx|th point from a precomputation table
// and copies it to out.
static void fiat_p256_select_point(const fiat_p256_limb_t idx, size_t size,
const fiat_p256_felem pre_comp[/*size*/][3],
fiat_p256_felem out[3]) {
OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3);
for (size_t i = 0; i < size; i++) {
fiat_p256_limb_t mismatch = i ^ idx;
fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]);
fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]);
fiat_p256_cmovznz(out[2], mismatch, pre_comp[i][2], out[2]);
}
}
// fiat_p256_get_bit returns the |i|th bit in |in|.
static crypto_word_t fiat_p256_get_bit(const EC_SCALAR *in, int i) {
if (i < 0 || i >= 256) {
return 0;
}
#if defined(OPENSSL_64_BIT)
static_assert(sizeof(BN_ULONG) == 8, "BN_ULONG was not 64-bit");
return (in->words[i >> 6] >> (i & 63)) & 1;
#else
static_assert(sizeof(BN_ULONG) == 4, "BN_ULONG was not 32-bit");
return (in->words[i >> 5] >> (i & 31)) & 1;
#endif
}
// OPENSSL EC_METHOD FUNCTIONS
// Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
// (X/Z^2, Y/Z^3).
static int ec_GFp_nistp256_point_get_affine_coordinates(
const EC_GROUP *group, const EC_JACOBIAN *point, EC_FELEM *x_out,
EC_FELEM *y_out) {
if (constant_time_declassify_int(
ec_GFp_simple_is_at_infinity(group, point))) {
OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
return 0;
}
fiat_p256_felem z1, z2;
fiat_p256_from_generic(z1, &point->Z);
fiat_p256_inv_square(z2, z1);
if (x_out != NULL) {
fiat_p256_felem x;
fiat_p256_from_generic(x, &point->X);
fiat_p256_mul(x, x, z2);
fiat_p256_to_generic(x_out, x);
}
if (y_out != NULL) {
fiat_p256_felem y;
fiat_p256_from_generic(y, &point->Y);
fiat_p256_square(z2, z2); // z^-4
fiat_p256_mul(y, y, z1); // y * z
fiat_p256_mul(y, y, z2); // y * z^-3
fiat_p256_to_generic(y_out, y);
}
return 1;
}
static void ec_GFp_nistp256_add(const EC_GROUP *group, EC_JACOBIAN *r,
const EC_JACOBIAN *a, const EC_JACOBIAN *b) {
fiat_p256_felem x1, y1, z1, x2, y2, z2;
fiat_p256_from_generic(x1, &a->X);
fiat_p256_from_generic(y1, &a->Y);
fiat_p256_from_generic(z1, &a->Z);
fiat_p256_from_generic(x2, &b->X);
fiat_p256_from_generic(y2, &b->Y);
fiat_p256_from_generic(z2, &b->Z);
fiat_p256_point_add(x1, y1, z1, x1, y1, z1, x2, y2, z2);
fiat_p256_to_generic(&r->X, x1);
fiat_p256_to_generic(&r->Y, y1);
fiat_p256_to_generic(&r->Z, z1);
}
static void ec_GFp_nistp256_dbl(const EC_GROUP *group, EC_JACOBIAN *r,
const EC_JACOBIAN *a) {
fiat_p256_felem x, y, z;
fiat_p256_from_generic(x, &a->X);
fiat_p256_from_generic(y, &a->Y);
fiat_p256_from_generic(z, &a->Z);
fiat_p256_point_double(x, y, z, x, y, z);
fiat_p256_to_generic(&r->X, x);
fiat_p256_to_generic(&r->Y, y);
fiat_p256_to_generic(&r->Z, z);
}
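
// ec_GFp_nistp256_point_mul computes |scalar|*|p| using a fixed-window method:
// the multiples 0..16 of |p| are precomputed, and every five doublings a
// signed window derived from |scalar| selects, in constant time, which
// multiple to add or subtract.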
static void ec_GFp_nistp256_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
const EC_JACOBIAN *p,
const EC_SCALAR *scalar) {
fiat_p256_felem p_pre_comp[17][3];
OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp));
// Precompute multiples.
fiat_p256_from_generic(p_pre_comp[1][0], &p->X);
fiat_p256_from_generic(p_pre_comp[1][1], &p->Y);
fiat_p256_from_generic(p_pre_comp[1][2], &p->Z);
for (size_t j = 2; j <= 16; ++j) {
if (j & 1) {
fiat_p256_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2],
p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2],
p_pre_comp[j - 1][0], p_pre_comp[j - 1][1],
p_pre_comp[j - 1][2]);
} else {
fiat_p256_point_double(p_pre_comp[j][0], p_pre_comp[j][1],
p_pre_comp[j][2], p_pre_comp[j / 2][0],
p_pre_comp[j / 2][1], p_pre_comp[j / 2][2]);
}
}
// Set nq to the point at infinity.
fiat_p256_felem nq[3] = {{0}, {0}, {0}}, ftmp, tmp[3];
// Loop over |scalar| msb-to-lsb, incorporating |p_pre_comp| every 5th round.
int skip = 1; // Save two point operations in the first round.
for (size_t i = 255; i < 256; i--) {
// double
if (!skip) {
fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
}
// do other additions every 5 doublings
if (i % 5 == 0) {
crypto_word_t bits = fiat_p256_get_bit(scalar, i + 4) << 5;
bits |= fiat_p256_get_bit(scalar, i + 3) << 4;
bits |= fiat_p256_get_bit(scalar, i + 2) << 3;
bits |= fiat_p256_get_bit(scalar, i + 1) << 2;
bits |= fiat_p256_get_bit(scalar, i) << 1;
bits |= fiat_p256_get_bit(scalar, i - 1);
crypto_word_t sign, digit;
ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
// select the point to add or subtract, in constant time.
fiat_p256_select_point((fiat_p256_limb_t)digit, 17,
(const fiat_p256_felem(*)[3])p_pre_comp, tmp);
fiat_p256_opp(ftmp, tmp[1]); // (X, -Y, Z) is the negative point.
fiat_p256_cmovznz(tmp[1], (fiat_p256_limb_t)sign, tmp[1], ftmp);
if (!skip) {
fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], tmp[0],
tmp[1], tmp[2]);
} else {
fiat_p256_copy(nq[0], tmp[0]);
fiat_p256_copy(nq[1], tmp[1]);
fiat_p256_copy(nq[2], tmp[2]);
skip = 0;
}
}
}
fiat_p256_to_generic(&r->X, nq[0]);
fiat_p256_to_generic(&r->Y, nq[1]);
fiat_p256_to_generic(&r->Z, nq[2]);
}
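
// ec_GFp_nistp256_point_mul_base computes |scalar|*G using a comb method over
// the precomputed generator table |fiat_p256_g_pre_comp|: in each of 32
// iterations, two 4-bit combs (each tooth 64 bits apart in the scalar, the two
// combs offset from each other by 32 bits) select affine points from the
// table, in constant time, to add into the accumulator.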
static void ec_GFp_nistp256_point_mul_base(const EC_GROUP *group,
EC_JACOBIAN *r,
const EC_SCALAR *scalar) {
// Set nq to the point at infinity.
fiat_p256_felem nq[3] = {{0}, {0}, {0}}, tmp[3];
int skip = 1; // Save two point operations in the first round.
for (size_t i = 31; i < 32; i--) {
if (!skip) {
fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
}
// First, look 32 bits upwards.
crypto_word_t bits = fiat_p256_get_bit(scalar, i + 224) << 3;
bits |= fiat_p256_get_bit(scalar, i + 160) << 2;
bits |= fiat_p256_get_bit(scalar, i + 96) << 1;
bits |= fiat_p256_get_bit(scalar, i + 32);
// Select the point to add, in constant time.
fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15,
fiat_p256_g_pre_comp[1], tmp);
if (!skip) {
fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], tmp[0],
tmp[1], tmp[2]);
} else {
fiat_p256_copy(nq[0], tmp[0]);
fiat_p256_copy(nq[1], tmp[1]);
fiat_p256_copy(nq[2], tmp[2]);
skip = 0;
}
// Second, look at the current position.
bits = fiat_p256_get_bit(scalar, i + 192) << 3;
bits |= fiat_p256_get_bit(scalar, i + 128) << 2;
bits |= fiat_p256_get_bit(scalar, i + 64) << 1;
bits |= fiat_p256_get_bit(scalar, i);
// Select the point to add, in constant time.
fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15,
fiat_p256_g_pre_comp[0], tmp);
fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], tmp[0],
tmp[1], tmp[2]);
}
fiat_p256_to_generic(&r->X, nq[0]);
fiat_p256_to_generic(&r->Y, nq[1]);
fiat_p256_to_generic(&r->Z, nq[2]);
}
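
// ec_GFp_nistp256_point_mul_public computes g_scalar*G + p_scalar*|p|. The
// scalars are treated as public (this is used for signature verification), so
// the generator table lookups are variable-time and |p_scalar| is processed in
// wNAF form.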
static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group,
EC_JACOBIAN *r,
const EC_SCALAR *g_scalar,
const EC_JACOBIAN *p,
const EC_SCALAR *p_scalar) {
#define P256_WSIZE_PUBLIC 4
// Precompute multiples of |p|. p_pre_comp[i] is (2*i+1) * |p|.
fiat_p256_felem p_pre_comp[1 << (P256_WSIZE_PUBLIC - 1)][3];
fiat_p256_from_generic(p_pre_comp[0][0], &p->X);
fiat_p256_from_generic(p_pre_comp[0][1], &p->Y);
fiat_p256_from_generic(p_pre_comp[0][2], &p->Z);
fiat_p256_felem p2[3];
fiat_p256_point_double(p2[0], p2[1], p2[2], p_pre_comp[0][0],
p_pre_comp[0][1], p_pre_comp[0][2]);
for (size_t i = 1; i < std::size(p_pre_comp); i++) {
fiat_p256_point_add(p_pre_comp[i][0], p_pre_comp[i][1], p_pre_comp[i][2],
p_pre_comp[i - 1][0], p_pre_comp[i - 1][1],
p_pre_comp[i - 1][2], p2[0], p2[1], p2[2]);
}
// Set up the coefficients for |p_scalar|.
int8_t p_wNAF[257];
ec_compute_wNAF(group, p_wNAF, p_scalar, 256, P256_WSIZE_PUBLIC);
// Set |ret| to the point at infinity.
int skip = 1; // Save some point operations.
fiat_p256_felem ret[3] = {{0}, {0}, {0}};
for (int i = 256; i >= 0; i--) {
if (!skip) {
fiat_p256_point_double(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2]);
}
// For the |g_scalar|, we use the precomputed table without the
// constant-time lookup.
if (i <= 31) {
// First, look 32 bits upwards.
crypto_word_t bits = fiat_p256_get_bit(g_scalar, i + 224) << 3;
bits |= fiat_p256_get_bit(g_scalar, i + 160) << 2;
bits |= fiat_p256_get_bit(g_scalar, i + 96) << 1;
bits |= fiat_p256_get_bit(g_scalar, i + 32);
if (bits != 0) {
size_t index = (size_t)(bits - 1);
fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
fiat_p256_g_pre_comp[1][index][0],
fiat_p256_g_pre_comp[1][index][1], fiat_p256_one);
skip = 0;
}
// Second, look at the current position.
bits = fiat_p256_get_bit(g_scalar, i + 192) << 3;
bits |= fiat_p256_get_bit(g_scalar, i + 128) << 2;
bits |= fiat_p256_get_bit(g_scalar, i + 64) << 1;
bits |= fiat_p256_get_bit(g_scalar, i);
if (bits != 0) {
size_t index = (size_t)(bits - 1);
fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
fiat_p256_g_pre_comp[0][index][0],
fiat_p256_g_pre_comp[0][index][1], fiat_p256_one);
skip = 0;
}
}
int digit = p_wNAF[i];
if (digit != 0) {
assert(digit & 1);
size_t idx = (size_t)(digit < 0 ? (-digit) >> 1 : digit >> 1);
fiat_p256_felem *y = &p_pre_comp[idx][1], tmp;
if (digit < 0) {
fiat_p256_opp(tmp, p_pre_comp[idx][1]);
y = &tmp;
}
if (!skip) {
fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
p_pre_comp[idx][0], *y, p_pre_comp[idx][2]);
} else {
fiat_p256_copy(ret[0], p_pre_comp[idx][0]);
fiat_p256_copy(ret[1], *y);
fiat_p256_copy(ret[2], p_pre_comp[idx][2]);
skip = 0;
}
}
}
fiat_p256_to_generic(&r->X, ret[0]);
fiat_p256_to_generic(&r->Y, ret[1]);
fiat_p256_to_generic(&r->Z, ret[2]);
}
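
// ec_GFp_nistp256_cmp_x_coordinate returns one if the affine x coordinate of
// |p|, reduced modulo the group order, equals the scalar |r|, and zero
// otherwise. Comparing X with r*Z^2 avoids the field inversion that computing
// the affine x coordinate would require.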
static int ec_GFp_nistp256_cmp_x_coordinate(const EC_GROUP *group,
const EC_JACOBIAN *p,
const EC_SCALAR *r) {
if (ec_GFp_simple_is_at_infinity(group, p)) {
return 0;
}
// We wish to compare X/Z^2 with r. This is equivalent to comparing X with
// r*Z^2. Note that X and Z are represented in Montgomery form, while r is
// not.
fiat_p256_felem Z2_mont;
fiat_p256_from_generic(Z2_mont, &p->Z);
fiat_p256_mul(Z2_mont, Z2_mont, Z2_mont);
fiat_p256_felem r_Z2;
fiat_p256_from_words(r_Z2, r->words); // r < order < p, so this is valid.
fiat_p256_mul(r_Z2, r_Z2, Z2_mont);
fiat_p256_felem X;
fiat_p256_from_generic(X, &p->X);
fiat_p256_from_montgomery(X, X);
if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) {
return 1;
}
// During signing the x coordinate is reduced modulo the group order.
// Therefore there is a small possibility, less than 1/2^128, that
// group_order < p.x < p. In that case we must compare not only against |r|
// but also against r + group_order.
assert(group->field.N.width == group->order.N.width);
EC_FELEM tmp;
BN_ULONG carry =
bn_add_words(tmp.words, r->words, group->order.N.d, group->field.N.width);
if (carry == 0 &&
bn_less_than_words(tmp.words, group->field.N.d, group->field.N.width)) {
fiat_p256_from_generic(r_Z2, &tmp);
fiat_p256_mul(r_Z2, r_Z2, Z2_mont);
if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) {
return 1;
}
}
return 0;
}
DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp256_method) {
out->point_get_affine_coordinates =
ec_GFp_nistp256_point_get_affine_coordinates;
out->add = ec_GFp_nistp256_add;
out->dbl = ec_GFp_nistp256_dbl;
out->mul = ec_GFp_nistp256_point_mul;
out->mul_base = ec_GFp_nistp256_point_mul_base;
out->mul_public = ec_GFp_nistp256_point_mul_public;
out->felem_mul = ec_GFp_mont_felem_mul;
out->felem_sqr = ec_GFp_mont_felem_sqr;
out->felem_to_bytes = ec_GFp_mont_felem_to_bytes;
out->felem_from_bytes = ec_GFp_mont_felem_from_bytes;
out->felem_reduce = ec_GFp_mont_felem_reduce;
// TODO(davidben): This should use the specialized field arithmetic
// implementation, rather than the generic one.
out->felem_exp = ec_GFp_mont_felem_exp;
out->scalar_inv0_montgomery = ec_simple_scalar_inv0_montgomery;
out->scalar_to_montgomery_inv_vartime =
ec_simple_scalar_to_montgomery_inv_vartime;
out->cmp_x_coordinate = ec_GFp_nistp256_cmp_x_coordinate;
}