Use fiat-crypto's freeze function for fe_tobytes.
It requires a handful of additional intrinsics for now.
Fiat's freeze function only works on the tight bounds, so fe_isnonzero
gains an extra fe_carry. But all other calls of fe_tobytes are of tight
bounds anyway.
Change-Id: I834858cee7863c7344e456d7a7dbf4f414f04ae5
Reviewed-on: https://boringssl-review.googlesource.com/24545
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/third_party/fiat/README.md b/third_party/fiat/README.md
index eaf0a60..db67dc9 100644
--- a/third_party/fiat/README.md
+++ b/third_party/fiat/README.md
@@ -7,7 +7,7 @@
## Curve25519
To generate the field arithmetic procedures in `curve25519.c` from a fiat-crypto
-checkout (as of `c47f48268f15e202a28b556845f231b2038cb426`), run
+checkout (as of `693d62c6fd7370bf71b8eb3b9a5825dfd071fcac`), run
`make src/Specific/solinas32_2e255m19_10limbs/femul.c` (replacing `femul` with
the desired field operation). The "source" file specifying the finite field and
referencing the desired implementation strategy is
diff --git a/third_party/fiat/curve25519.c b/third_party/fiat/curve25519.c
index 2e56450..73d76a0 100644
--- a/third_party/fiat/curve25519.c
+++ b/third_party/fiat/curve25519.c
@@ -41,9 +41,6 @@
#include "../../crypto/internal.h"
-static const int64_t kBottom25Bits = INT64_C(0x1ffffff);
-static const int64_t kBottom26Bits = INT64_C(0x3ffffff);
-
static uint64_t load_3(const uint8_t *in) {
uint64_t result;
result = (uint64_t)in[0];
@@ -73,6 +70,12 @@
} \
} while (0)
+#define assert_fe_frozen(f) do { \
+ for (unsigned _assert_fe_i = 0; _assert_fe_i< 10; _assert_fe_i++) { \
+ assert(f[_assert_fe_i] < (1u<<(26-(_assert_fe_i&1)))); \
+ } \
+} while (0)
+
static void fe_frombytes_impl(uint32_t h[10], const uint8_t *s) {
// Ignores top bit of s.
uint32_t a0 = load_4(s);
@@ -100,115 +103,140 @@
fe_frombytes_impl(h->v, s);
}
-// Preconditions:
-// |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
-//
-// Write p=2^255-19; q=floor(h/p).
-// Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
-//
-// Proof:
-// Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
-// Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
-//
-// Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
-// Then 0<y<1.
-//
-// Write r=h-pq.
-// Have 0<=r<=p-1=2^255-20.
-// Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
-//
-// Write x=r+19(2^-255)r+y.
-// Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
-//
-// Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
-// so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
-static void fe_tobytes_impl(uint8_t s[32], const uint32_t h[10]) {
- assert_fe_loose(h);
- int32_t h0 = h[0];
- int32_t h1 = h[1];
- int32_t h2 = h[2];
- int32_t h3 = h[3];
- int32_t h4 = h[4];
- int32_t h5 = h[5];
- int32_t h6 = h[6];
- int32_t h7 = h[7];
- int32_t h8 = h[8];
- int32_t h9 = h[9];
- int32_t q;
-
- q = (19 * h9 + (((int32_t) 1) << 24)) >> 25;
- q = (h0 + q) >> 26;
- q = (h1 + q) >> 25;
- q = (h2 + q) >> 26;
- q = (h3 + q) >> 25;
- q = (h4 + q) >> 26;
- q = (h5 + q) >> 25;
- q = (h6 + q) >> 26;
- q = (h7 + q) >> 25;
- q = (h8 + q) >> 26;
- q = (h9 + q) >> 25;
-
- // Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20.
- h0 += 19 * q;
- // Goal: Output h-2^255 q, which is between 0 and 2^255-20.
-
- h1 += h0 >> 26; h0 &= kBottom26Bits;
- h2 += h1 >> 25; h1 &= kBottom25Bits;
- h3 += h2 >> 26; h2 &= kBottom26Bits;
- h4 += h3 >> 25; h3 &= kBottom25Bits;
- h5 += h4 >> 26; h4 &= kBottom26Bits;
- h6 += h5 >> 25; h5 &= kBottom25Bits;
- h7 += h6 >> 26; h6 &= kBottom26Bits;
- h8 += h7 >> 25; h7 &= kBottom25Bits;
- h9 += h8 >> 26; h8 &= kBottom26Bits;
- h9 &= kBottom25Bits;
- // h10 = carry9
-
- // Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
- // Have h0+...+2^230 h9 between 0 and 2^255-1;
- // evidently 2^255 h10-2^255 q = 0.
- // Goal: Output h0+...+2^230 h9.
-
- s[0] = h0 >> 0;
- s[1] = h0 >> 8;
- s[2] = h0 >> 16;
- s[3] = (h0 >> 24) | ((uint32_t)(h1) << 2);
- s[4] = h1 >> 6;
- s[5] = h1 >> 14;
- s[6] = (h1 >> 22) | ((uint32_t)(h2) << 3);
- s[7] = h2 >> 5;
- s[8] = h2 >> 13;
- s[9] = (h2 >> 21) | ((uint32_t)(h3) << 5);
- s[10] = h3 >> 3;
- s[11] = h3 >> 11;
- s[12] = (h3 >> 19) | ((uint32_t)(h4) << 6);
- s[13] = h4 >> 2;
- s[14] = h4 >> 10;
- s[15] = h4 >> 18;
- s[16] = h5 >> 0;
- s[17] = h5 >> 8;
- s[18] = h5 >> 16;
- s[19] = (h5 >> 24) | ((uint32_t)(h6) << 1);
- s[20] = h6 >> 7;
- s[21] = h6 >> 15;
- s[22] = (h6 >> 23) | ((uint32_t)(h7) << 3);
- s[23] = h7 >> 5;
- s[24] = h7 >> 13;
- s[25] = (h7 >> 21) | ((uint32_t)(h8) << 4);
- s[26] = h8 >> 4;
- s[27] = h8 >> 12;
- s[28] = (h8 >> 20) | ((uint32_t)(h9) << 6);
- s[29] = h9 >> 2;
- s[30] = h9 >> 10;
- s[31] = h9 >> 18;
+static uint8_t /*bool*/ addcarryx_u25(uint8_t /*bool*/ c, uint32_t a,
+ uint32_t b, uint32_t *low) {
+ // This function extracts 25 bits of result and 1 bit of carry (26 total), so
+ // a 32-bit intermediate is sufficient.
+ uint32_t x = a + b + c;
+ *low = x & ((1 << 25) - 1);
+ return (x >> 25) & 1;
}
-static void fe_tobytes(uint8_t s[32], const fe *h) {
- fe_tobytes_impl(s, h->v);
+static uint8_t /*bool*/ addcarryx_u26(uint8_t /*bool*/ c, uint32_t a,
+ uint32_t b, uint32_t *low) {
+ // This function extracts 26 bits of result and 1 bit of carry (27 total), so
+ // a 32-bit intermediate is sufficient.
+ uint32_t x = a + b + c;
+ *low = x & ((1 << 26) - 1);
+ return (x >> 26) & 1;
}
-static void fe_loose_tobytes(uint8_t s[32], const fe_loose *h) {
- fe_tobytes_impl(s, h->v);
+static uint8_t /*bool*/ subborrow_u25(uint8_t /*bool*/ c, uint32_t a,
+ uint32_t b, uint32_t *low) {
+ // This function extracts 25 bits of result and 1 bit of borrow (26 total), so
+ // a 32-bit intermediate is sufficient.
+ uint32_t x = a - b - c;
+ *low = x & ((1 << 25) - 1);
+ return x >> 31;
+}
+
+static uint8_t /*bool*/ subborrow_u26(uint8_t /*bool*/ c, uint32_t a,
+ uint32_t b, uint32_t *low) {
+ // This function extracts 26 bits of result and 1 bit of borrow (27 total), so
+ // a 32-bit intermediate is sufficient.
+ uint32_t x = a - b - c;
+ *low = x & ((1 << 26) - 1);
+ return x >> 31;
+}
+
+static uint32_t cmovznz32(uint32_t t, uint32_t z, uint32_t nz) {
+ t = -!!t; // all set if nonzero, 0 if 0
+ return (t&nz) | ((~t)&z);
+}
+
+static void fe_freeze(uint32_t out[10], const uint32_t in1[10]) {
+ { const uint32_t x17 = in1[9];
+ { const uint32_t x18 = in1[8];
+ { const uint32_t x16 = in1[7];
+ { const uint32_t x14 = in1[6];
+ { const uint32_t x12 = in1[5];
+ { const uint32_t x10 = in1[4];
+ { const uint32_t x8 = in1[3];
+ { const uint32_t x6 = in1[2];
+ { const uint32_t x4 = in1[1];
+ { const uint32_t x2 = in1[0];
+ { uint32_t x20; uint8_t/*bool*/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20);
+ { uint32_t x23; uint8_t/*bool*/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23);
+ { uint32_t x26; uint8_t/*bool*/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26);
+ { uint32_t x29; uint8_t/*bool*/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29);
+ { uint32_t x32; uint8_t/*bool*/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32);
+ { uint32_t x35; uint8_t/*bool*/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35);
+ { uint32_t x38; uint8_t/*bool*/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38);
+ { uint32_t x41; uint8_t/*bool*/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41);
+ { uint32_t x44; uint8_t/*bool*/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44);
+ { uint32_t x47; uint8_t/*bool*/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47);
+ { uint32_t x49 = cmovznz32(x48, 0x0, 0xffffffff);
+ { uint32_t x50 = (x49 & 0x3ffffed);
+ { uint32_t x52; uint8_t/*bool*/ x53 = addcarryx_u26(0x0, x20, x50, &x52);
+ { uint32_t x54 = (x49 & 0x1ffffff);
+ { uint32_t x56; uint8_t/*bool*/ x57 = addcarryx_u25(x53, x23, x54, &x56);
+ { uint32_t x58 = (x49 & 0x3ffffff);
+ { uint32_t x60; uint8_t/*bool*/ x61 = addcarryx_u26(x57, x26, x58, &x60);
+ { uint32_t x62 = (x49 & 0x1ffffff);
+ { uint32_t x64; uint8_t/*bool*/ x65 = addcarryx_u25(x61, x29, x62, &x64);
+ { uint32_t x66 = (x49 & 0x3ffffff);
+ { uint32_t x68; uint8_t/*bool*/ x69 = addcarryx_u26(x65, x32, x66, &x68);
+ { uint32_t x70 = (x49 & 0x1ffffff);
+ { uint32_t x72; uint8_t/*bool*/ x73 = addcarryx_u25(x69, x35, x70, &x72);
+ { uint32_t x74 = (x49 & 0x3ffffff);
+ { uint32_t x76; uint8_t/*bool*/ x77 = addcarryx_u26(x73, x38, x74, &x76);
+ { uint32_t x78 = (x49 & 0x1ffffff);
+ { uint32_t x80; uint8_t/*bool*/ x81 = addcarryx_u25(x77, x41, x78, &x80);
+ { uint32_t x82 = (x49 & 0x3ffffff);
+ { uint32_t x84; uint8_t/*bool*/ x85 = addcarryx_u26(x81, x44, x82, &x84);
+ { uint32_t x86 = (x49 & 0x1ffffff);
+ { uint32_t x88; addcarryx_u25(x85, x47, x86, &x88);
+ out[0] = x52;
+ out[1] = x56;
+ out[2] = x60;
+ out[3] = x64;
+ out[4] = x68;
+ out[5] = x72;
+ out[6] = x76;
+ out[7] = x80;
+ out[8] = x84;
+ out[9] = x88;
+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
+}
+
+static void fe_tobytes(uint8_t s[32], const fe *f) {
+ assert_fe(f->v);
+ uint32_t h[10];
+ fe_freeze(h, f->v);
+ assert_fe_frozen(h);
+
+ s[0] = h[0] >> 0;
+ s[1] = h[0] >> 8;
+ s[2] = h[0] >> 16;
+ s[3] = (h[0] >> 24) | (h[1] << 2);
+ s[4] = h[1] >> 6;
+ s[5] = h[1] >> 14;
+ s[6] = (h[1] >> 22) | (h[2] << 3);
+ s[7] = h[2] >> 5;
+ s[8] = h[2] >> 13;
+ s[9] = (h[2] >> 21) | (h[3] << 5);
+ s[10] = h[3] >> 3;
+ s[11] = h[3] >> 11;
+ s[12] = (h[3] >> 19) | (h[4] << 6);
+ s[13] = h[4] >> 2;
+ s[14] = h[4] >> 10;
+ s[15] = h[4] >> 18;
+ s[16] = h[5] >> 0;
+ s[17] = h[5] >> 8;
+ s[18] = h[5] >> 16;
+ s[19] = (h[5] >> 24) | (h[6] << 1);
+ s[20] = h[6] >> 7;
+ s[21] = h[6] >> 15;
+ s[22] = (h[6] >> 23) | (h[7] << 3);
+ s[23] = h[7] >> 5;
+ s[24] = h[7] >> 13;
+ s[25] = (h[7] >> 21) | (h[8] << 4);
+ s[26] = h[8] >> 4;
+ s[27] = h[8] >> 12;
+ s[28] = (h[8] >> 20) | (h[9] << 6);
+ s[29] = h[9] >> 2;
+ s[30] = h[9] >> 10;
+ s[31] = h[9] >> 18;
}
// h = f
@@ -775,8 +803,10 @@
// return 0 if f == 0
// return 1 if f != 0
static int fe_isnonzero(const fe_loose *f) {
+ fe tight;
+ fe_carry(&tight, f);
uint8_t s[32];
- fe_loose_tobytes(s, f);
+ fe_tobytes(s, &tight);
static const uint8_t zero[32] = {0};
return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0;