Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1 | /* Copyright (c) 2018, Google Inc. |
| 2 | * |
| 3 | * Permission to use, copy, modify, and/or distribute this software for any |
| 4 | * purpose with or without fee is hereby granted, provided that the above |
| 5 | * copyright notice and this permission notice appear in all copies. |
| 6 | * |
| 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| 8 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| 9 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
| 10 | * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 11 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
| 12 | * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
| 13 | * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ |
| 14 | |
| 15 | #include <openssl/hrss.h> |
| 16 | |
| 17 | #include <assert.h> |
| 18 | #include <stdio.h> |
| 19 | #include <stdlib.h> |
| 20 | |
| 21 | #include <openssl/bn.h> |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 22 | #include <openssl/hmac.h> |
| 23 | #include <openssl/mem.h> |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 24 | #include <openssl/rand.h> |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 25 | #include <openssl/sha.h> |
| 26 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 27 | #if defined(_MSC_VER) |
| 28 | #define RESTRICT |
| 29 | #else |
| 30 | #define RESTRICT restrict |
| 31 | #endif |
| 32 | |
| 33 | #include "../internal.h" |
| 34 | #include "internal.h" |
| 35 | |
David Benjamin | 6887d5e | 2019-12-17 13:56:30 -0500 | [diff] [blame] | 36 | #if defined(OPENSSL_SSE2) |
| 37 | #include <emmintrin.h> |
| 38 | #endif |
| 39 | |
David Benjamin | 846a227 | 2022-01-06 11:12:06 -0500 | [diff] [blame] | 40 | #if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && defined(__ARM_NEON) |
David Benjamin | 6887d5e | 2019-12-17 13:56:30 -0500 | [diff] [blame] | 41 | #include <arm_neon.h> |
| 42 | #endif |
| 43 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 44 | // This is an implementation of [HRSS], but with a KEM transformation based on |
| 45 | // [SXY]. The primary references are: |
| 46 | |
| 47 | // HRSS: https://eprint.iacr.org/2017/667.pdf |
| 48 | // HRSSNIST: |
| 49 | // https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/NTRU_HRSS_KEM.zip |
| 50 | // SXY: https://eprint.iacr.org/2017/1005.pdf |
| 51 | // NTRUTN14: |
| 52 | // https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 53 | // NTRUCOMP: https://eprint.iacr.org/2018/1174 |
| 54 | // SAFEGCD: https://gcd.cr.yp.to/papers.html#safegcd |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 55 | |
| 56 | |
| 57 | // Vector operations. |
| 58 | // |
| 59 | // A couple of functions in this file can use vector operations to meaningful |
| 60 | // effect. If we're building for a target that has a supported vector unit, |
| 61 | // |HRSS_HAVE_VECTOR_UNIT| will be defined and |vec_t| will be typedefed to a |
| 62 | // 128-bit vector. The following functions abstract over the differences between |
| 63 | // NEON and SSE2 for implementing some vector operations. |
| 64 | |
David Benjamin | 6887d5e | 2019-12-17 13:56:30 -0500 | [diff] [blame] | 65 | // TODO: MSVC can likely also be made to work with vector operations, but ^ must |
| 66 | // be replaced with _mm_xor_si128, etc. |
| 67 | #if defined(OPENSSL_SSE2) && (defined(__clang__) || !defined(_MSC_VER)) |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 68 | |
| 69 | #define HRSS_HAVE_VECTOR_UNIT |
| 70 | typedef __m128i vec_t; |
| 71 | |
| 72 | // vec_capable returns one iff the current platform supports SSE2. |
David Benjamin | 6887d5e | 2019-12-17 13:56:30 -0500 | [diff] [blame] | 73 | static int vec_capable(void) { return 1; } |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 74 | |
| 75 | // vec_add performs a pair-wise addition of four uint16s from |a| and |b|. |
| 76 | static inline vec_t vec_add(vec_t a, vec_t b) { return _mm_add_epi16(a, b); } |
| 77 | |
| 78 | // vec_sub performs a pair-wise subtraction of four uint16s from |a| and |b|. |
| 79 | static inline vec_t vec_sub(vec_t a, vec_t b) { return _mm_sub_epi16(a, b); } |
| 80 | |
| 81 | // vec_mul multiplies each uint16_t in |a| by |b| and returns the resulting |
| 82 | // vector. |
| 83 | static inline vec_t vec_mul(vec_t a, uint16_t b) { |
| 84 | return _mm_mullo_epi16(a, _mm_set1_epi16(b)); |
| 85 | } |
| 86 | |
| 87 | // vec_fma multiplies each uint16_t in |b| by |c|, adds the result to |a|, and |
| 88 | // returns the resulting vector. |
| 89 | static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) { |
| 90 | return _mm_add_epi16(a, _mm_mullo_epi16(b, _mm_set1_epi16(c))); |
| 91 | } |
| 92 | |
| 93 | // vec3_rshift_word right-shifts the 24 uint16_t's in |v| by one uint16. |
| 94 | static inline void vec3_rshift_word(vec_t v[3]) { |
| 95 | // Intel's left and right shifting is backwards compared to the order in |
| 96 | // memory because they're based on little-endian order of words (and not just |
| 97 | // bytes). So the shifts in this function will be backwards from what one |
| 98 | // might expect. |
| 99 | const __m128i carry0 = _mm_srli_si128(v[0], 14); |
| 100 | v[0] = _mm_slli_si128(v[0], 2); |
| 101 | |
| 102 | const __m128i carry1 = _mm_srli_si128(v[1], 14); |
| 103 | v[1] = _mm_slli_si128(v[1], 2); |
| 104 | v[1] |= carry0; |
| 105 | |
| 106 | v[2] = _mm_slli_si128(v[2], 2); |
| 107 | v[2] |= carry1; |
| 108 | } |
| 109 | |
| 110 | // vec4_rshift_word right-shifts the 32 uint16_t's in |v| by one uint16. |
| 111 | static inline void vec4_rshift_word(vec_t v[4]) { |
| 112 | // Intel's left and right shifting is backwards compared to the order in |
| 113 | // memory because they're based on little-endian order of words (and not just |
| 114 | // bytes). So the shifts in this function will be backwards from what one |
| 115 | // might expect. |
| 116 | const __m128i carry0 = _mm_srli_si128(v[0], 14); |
| 117 | v[0] = _mm_slli_si128(v[0], 2); |
| 118 | |
| 119 | const __m128i carry1 = _mm_srli_si128(v[1], 14); |
| 120 | v[1] = _mm_slli_si128(v[1], 2); |
| 121 | v[1] |= carry0; |
| 122 | |
| 123 | const __m128i carry2 = _mm_srli_si128(v[2], 14); |
| 124 | v[2] = _mm_slli_si128(v[2], 2); |
| 125 | v[2] |= carry1; |
| 126 | |
| 127 | v[3] = _mm_slli_si128(v[3], 2); |
| 128 | v[3] |= carry2; |
| 129 | } |
| 130 | |
| 131 | // vec_merge_3_5 takes the final three uint16_t's from |left|, appends the first |
| 132 | // five from |right|, and returns the resulting vector. |
| 133 | static inline vec_t vec_merge_3_5(vec_t left, vec_t right) { |
| 134 | return _mm_srli_si128(left, 10) | _mm_slli_si128(right, 6); |
| 135 | } |
| 136 | |
| 137 | // poly3_vec_lshift1 left-shifts the 768 bits in |a_s|, and in |a_a|, by one |
| 138 | // bit. |
| 139 | static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) { |
| 140 | vec_t carry_s = {0}; |
| 141 | vec_t carry_a = {0}; |
| 142 | |
| 143 | for (int i = 0; i < 6; i++) { |
| 144 | vec_t next_carry_s = _mm_srli_epi64(a_s[i], 63); |
| 145 | a_s[i] = _mm_slli_epi64(a_s[i], 1); |
| 146 | a_s[i] |= _mm_slli_si128(next_carry_s, 8); |
| 147 | a_s[i] |= carry_s; |
| 148 | carry_s = _mm_srli_si128(next_carry_s, 8); |
| 149 | |
| 150 | vec_t next_carry_a = _mm_srli_epi64(a_a[i], 63); |
| 151 | a_a[i] = _mm_slli_epi64(a_a[i], 1); |
| 152 | a_a[i] |= _mm_slli_si128(next_carry_a, 8); |
| 153 | a_a[i] |= carry_a; |
| 154 | carry_a = _mm_srli_si128(next_carry_a, 8); |
| 155 | } |
| 156 | } |
| 157 | |
| 158 | // poly3_vec_rshift1 right-shifts the 768 bits in |a_s|, and in |a_a|, by one |
| 159 | // bit. |
| 160 | static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) { |
| 161 | vec_t carry_s = {0}; |
| 162 | vec_t carry_a = {0}; |
| 163 | |
| 164 | for (int i = 5; i >= 0; i--) { |
| 165 | const vec_t next_carry_s = _mm_slli_epi64(a_s[i], 63); |
| 166 | a_s[i] = _mm_srli_epi64(a_s[i], 1); |
| 167 | a_s[i] |= _mm_srli_si128(next_carry_s, 8); |
| 168 | a_s[i] |= carry_s; |
| 169 | carry_s = _mm_slli_si128(next_carry_s, 8); |
| 170 | |
| 171 | const vec_t next_carry_a = _mm_slli_epi64(a_a[i], 63); |
| 172 | a_a[i] = _mm_srli_epi64(a_a[i], 1); |
| 173 | a_a[i] |= _mm_srli_si128(next_carry_a, 8); |
| 174 | a_a[i] |= carry_a; |
| 175 | carry_a = _mm_slli_si128(next_carry_a, 8); |
| 176 | } |
| 177 | } |
| 178 | |
| 179 | // vec_broadcast_bit duplicates the least-significant bit in |a| to all bits in |
| 180 | // a vector and returns the result. |
| 181 | static inline vec_t vec_broadcast_bit(vec_t a) { |
| 182 | return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63), 31), |
| 183 | 0b01010101); |
| 184 | } |
| 185 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 186 | // vec_get_word returns the |i|th uint16_t in |v|. (This is a macro because the |
| 187 | // compiler requires that |i| be a compile-time constant.) |
| 188 | #define vec_get_word(v, i) _mm_extract_epi16(v, i) |
| 189 | |
David Benjamin | 846a227 | 2022-01-06 11:12:06 -0500 | [diff] [blame] | 190 | #elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && defined(__ARM_NEON) |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 191 | |
| 192 | #define HRSS_HAVE_VECTOR_UNIT |
| 193 | typedef uint16x8_t vec_t; |
| 194 | |
| 195 | // These functions perform the same actions as the SSE2 function of the same |
| 196 | // name, above. |
| 197 | |
Alessandro Ghedini | 9b0970f | 2018-12-14 16:43:36 +0000 | [diff] [blame] | 198 | static int vec_capable(void) { return CRYPTO_is_NEON_capable(); } |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 199 | |
| 200 | static inline vec_t vec_add(vec_t a, vec_t b) { return a + b; } |
| 201 | |
| 202 | static inline vec_t vec_sub(vec_t a, vec_t b) { return a - b; } |
| 203 | |
| 204 | static inline vec_t vec_mul(vec_t a, uint16_t b) { return vmulq_n_u16(a, b); } |
| 205 | |
| 206 | static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) { |
| 207 | return vmlaq_n_u16(a, b, c); |
| 208 | } |
| 209 | |
| 210 | static inline void vec3_rshift_word(vec_t v[3]) { |
| 211 | const uint16x8_t kZero = {0}; |
| 212 | v[2] = vextq_u16(v[1], v[2], 7); |
| 213 | v[1] = vextq_u16(v[0], v[1], 7); |
| 214 | v[0] = vextq_u16(kZero, v[0], 7); |
| 215 | } |
| 216 | |
| 217 | static inline void vec4_rshift_word(vec_t v[4]) { |
| 218 | const uint16x8_t kZero = {0}; |
| 219 | v[3] = vextq_u16(v[2], v[3], 7); |
| 220 | v[2] = vextq_u16(v[1], v[2], 7); |
| 221 | v[1] = vextq_u16(v[0], v[1], 7); |
| 222 | v[0] = vextq_u16(kZero, v[0], 7); |
| 223 | } |
| 224 | |
| 225 | static inline vec_t vec_merge_3_5(vec_t left, vec_t right) { |
| 226 | return vextq_u16(left, right, 5); |
| 227 | } |
| 228 | |
| 229 | static inline uint16_t vec_get_word(vec_t v, unsigned i) { |
| 230 | return v[i]; |
| 231 | } |
| 232 | |
| 233 | #if !defined(OPENSSL_AARCH64) |
| 234 | |
| 235 | static inline vec_t vec_broadcast_bit(vec_t a) { |
| 236 | a = (vec_t)vshrq_n_s16(((int16x8_t)a) << 15, 15); |
| 237 | return vdupq_lane_u16(vget_low_u16(a), 0); |
| 238 | } |
| 239 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 240 | static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) { |
| 241 | vec_t carry_s = {0}; |
| 242 | vec_t carry_a = {0}; |
| 243 | const vec_t kZero = {0}; |
| 244 | |
| 245 | for (int i = 0; i < 6; i++) { |
| 246 | vec_t next_carry_s = a_s[i] >> 15; |
| 247 | a_s[i] <<= 1; |
| 248 | a_s[i] |= vextq_u16(kZero, next_carry_s, 7); |
| 249 | a_s[i] |= carry_s; |
| 250 | carry_s = vextq_u16(next_carry_s, kZero, 7); |
| 251 | |
| 252 | vec_t next_carry_a = a_a[i] >> 15; |
| 253 | a_a[i] <<= 1; |
| 254 | a_a[i] |= vextq_u16(kZero, next_carry_a, 7); |
| 255 | a_a[i] |= carry_a; |
| 256 | carry_a = vextq_u16(next_carry_a, kZero, 7); |
| 257 | } |
| 258 | } |
| 259 | |
| 260 | static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) { |
| 261 | vec_t carry_s = {0}; |
| 262 | vec_t carry_a = {0}; |
| 263 | const vec_t kZero = {0}; |
| 264 | |
| 265 | for (int i = 5; i >= 0; i--) { |
| 266 | vec_t next_carry_s = a_s[i] << 15; |
| 267 | a_s[i] >>= 1; |
| 268 | a_s[i] |= vextq_u16(next_carry_s, kZero, 1); |
| 269 | a_s[i] |= carry_s; |
| 270 | carry_s = vextq_u16(kZero, next_carry_s, 1); |
| 271 | |
| 272 | vec_t next_carry_a = a_a[i] << 15; |
| 273 | a_a[i] >>= 1; |
| 274 | a_a[i] |= vextq_u16(next_carry_a, kZero, 1); |
| 275 | a_a[i] |= carry_a; |
| 276 | carry_a = vextq_u16(kZero, next_carry_a, 1); |
| 277 | } |
| 278 | } |
| 279 | |
| 280 | #endif // !OPENSSL_AARCH64 |
| 281 | |
| 282 | #endif // (ARM || AARCH64) && NEON |
| 283 | |
| 284 | // Polynomials in this scheme have N terms. |
| 285 | // #define N 701 |
| 286 | |
| 287 | // Underlying data types and arithmetic operations. |
| 288 | // ------------------------------------------------ |
| 289 | |
| 290 | // Binary polynomials. |
| 291 | |
| 292 | // poly2 represents a degree-N polynomial over GF(2). The words are in little- |
| 293 | // endian order, i.e. the coefficient of x^0 is the LSB of the first word. The |
| 294 | // final word is only partially used since N is not a multiple of the word size. |
| 295 | |
| 296 | // Defined in internal.h: |
| 297 | // struct poly2 { |
| 298 | // crypto_word_t v[WORDS_PER_POLY]; |
| 299 | // }; |
| 300 | |
| 301 | OPENSSL_UNUSED static void hexdump(const void *void_in, size_t len) { |
| 302 | const uint8_t *in = (const uint8_t *)void_in; |
| 303 | for (size_t i = 0; i < len; i++) { |
| 304 | printf("%02x", in[i]); |
| 305 | } |
| 306 | printf("\n"); |
| 307 | } |
| 308 | |
| 309 | static void poly2_zero(struct poly2 *p) { |
| 310 | OPENSSL_memset(&p->v[0], 0, sizeof(crypto_word_t) * WORDS_PER_POLY); |
| 311 | } |
| 312 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 313 | // word_reverse returns |in| with the bits in reverse order. |
| 314 | static crypto_word_t word_reverse(crypto_word_t in) { |
| 315 | #if defined(OPENSSL_64_BIT) |
| 316 | static const crypto_word_t kMasks[6] = { |
| 317 | UINT64_C(0x5555555555555555), |
| 318 | UINT64_C(0x3333333333333333), |
| 319 | UINT64_C(0x0f0f0f0f0f0f0f0f), |
| 320 | UINT64_C(0x00ff00ff00ff00ff), |
| 321 | UINT64_C(0x0000ffff0000ffff), |
| 322 | UINT64_C(0x00000000ffffffff), |
| 323 | }; |
| 324 | #else |
| 325 | static const crypto_word_t kMasks[5] = { |
| 326 | 0x55555555, |
| 327 | 0x33333333, |
| 328 | 0x0f0f0f0f, |
| 329 | 0x00ff00ff, |
| 330 | 0x0000ffff, |
| 331 | }; |
| 332 | #endif |
| 333 | |
| 334 | for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kMasks); i++) { |
| 335 | in = ((in >> (1 << i)) & kMasks[i]) | ((in & kMasks[i]) << (1 << i)); |
| 336 | } |
| 337 | |
| 338 | return in; |
| 339 | } |
| 340 | |
| 341 | // lsb_to_all replicates the least-significant bit of |v| to all bits of the |
| 342 | // word. This is used in bit-slicing operations to make a vector from a fixed |
| 343 | // value. |
| 344 | static crypto_word_t lsb_to_all(crypto_word_t v) { return 0u - (v & 1); } |
| 345 | |
| 346 | // poly2_mod_phiN reduces |p| by Φ(N). |
| 347 | static void poly2_mod_phiN(struct poly2 *p) { |
| 348 | // m is the term at x^700, replicated to every bit. |
| 349 | const crypto_word_t m = |
| 350 | lsb_to_all(p->v[WORDS_PER_POLY - 1] >> (BITS_IN_LAST_WORD - 1)); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 351 | for (size_t i = 0; i < WORDS_PER_POLY; i++) { |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 352 | p->v[i] ^= m; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 353 | } |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 354 | p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << (BITS_IN_LAST_WORD - 1)) - 1; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 355 | } |
| 356 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 357 | // poly2_reverse_700 reverses the order of the first 700 bits of |in| and writes |
| 358 | // the result to |out|. |
| 359 | static void poly2_reverse_700(struct poly2 *out, const struct poly2 *in) { |
| 360 | struct poly2 t; |
| 361 | for (size_t i = 0; i < WORDS_PER_POLY; i++) { |
| 362 | t.v[i] = word_reverse(in->v[i]); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 363 | } |
| 364 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 365 | static const size_t shift = BITS_PER_WORD - ((N-1) % BITS_PER_WORD); |
| 366 | for (size_t i = 0; i < WORDS_PER_POLY-1; i++) { |
| 367 | out->v[i] = t.v[WORDS_PER_POLY-1-i] >> shift; |
| 368 | out->v[i] |= t.v[WORDS_PER_POLY-2-i] << (BITS_PER_WORD - shift); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 369 | } |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 370 | out->v[WORDS_PER_POLY-1] = t.v[0] >> shift; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 371 | } |
| 372 | |
| 373 | // poly2_cswap exchanges the values of |a| and |b| if |swap| is all ones. |
| 374 | static void poly2_cswap(struct poly2 *a, struct poly2 *b, crypto_word_t swap) { |
| 375 | for (size_t i = 0; i < WORDS_PER_POLY; i++) { |
| 376 | const crypto_word_t sum = swap & (a->v[i] ^ b->v[i]); |
| 377 | a->v[i] ^= sum; |
| 378 | b->v[i] ^= sum; |
| 379 | } |
| 380 | } |
| 381 | |
| 382 | // poly2_fmadd sets |out| to |out| + |in| * m, where m is either |
| 383 | // |CONSTTIME_TRUE_W| or |CONSTTIME_FALSE_W|. |
| 384 | static void poly2_fmadd(struct poly2 *out, const struct poly2 *in, |
| 385 | crypto_word_t m) { |
| 386 | for (size_t i = 0; i < WORDS_PER_POLY; i++) { |
| 387 | out->v[i] ^= in->v[i] & m; |
| 388 | } |
| 389 | } |
| 390 | |
| 391 | // poly2_lshift1 left-shifts |p| by one bit. |
| 392 | static void poly2_lshift1(struct poly2 *p) { |
| 393 | crypto_word_t carry = 0; |
| 394 | for (size_t i = 0; i < WORDS_PER_POLY; i++) { |
| 395 | const crypto_word_t next_carry = p->v[i] >> (BITS_PER_WORD - 1); |
| 396 | p->v[i] <<= 1; |
| 397 | p->v[i] |= carry; |
| 398 | carry = next_carry; |
| 399 | } |
| 400 | } |
| 401 | |
| 402 | // poly2_rshift1 right-shifts |p| by one bit. |
| 403 | static void poly2_rshift1(struct poly2 *p) { |
| 404 | crypto_word_t carry = 0; |
| 405 | for (size_t i = WORDS_PER_POLY - 1; i < WORDS_PER_POLY; i--) { |
| 406 | const crypto_word_t next_carry = p->v[i] & 1; |
| 407 | p->v[i] >>= 1; |
| 408 | p->v[i] |= carry << (BITS_PER_WORD - 1); |
| 409 | carry = next_carry; |
| 410 | } |
| 411 | } |
| 412 | |
| 413 | // poly2_clear_top_bits clears the bits in the final word that are only for |
| 414 | // alignment. |
| 415 | static void poly2_clear_top_bits(struct poly2 *p) { |
| 416 | p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1; |
| 417 | } |
| 418 | |
| 419 | // poly2_top_bits_are_clear returns one iff the extra bits in the final words of |
| 420 | // |p| are zero. |
| 421 | static int poly2_top_bits_are_clear(const struct poly2 *p) { |
| 422 | return (p->v[WORDS_PER_POLY - 1] & |
| 423 | ~((UINT64_C(1) << BITS_IN_LAST_WORD) - 1)) == 0; |
| 424 | } |
| 425 | |
| 426 | // Ternary polynomials. |
| 427 | |
| 428 | // poly3 represents a degree-N polynomial over GF(3). Each coefficient is |
| 429 | // bitsliced across the |s| and |a| arrays, like this: |
| 430 | // |
| 431 | // s | a | value |
| 432 | // ----------------- |
| 433 | // 0 | 0 | 0 |
| 434 | // 0 | 1 | 1 |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 435 | // 1 | 1 | -1 (aka 2) |
| 436 | // 1 | 0 | <invalid> |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 437 | // |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 438 | // ('s' is for sign, and 'a' is the absolute value.) |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 439 | // |
| 440 | // Once bitsliced as such, the following circuits can be used to implement |
| 441 | // addition and multiplication mod 3: |
| 442 | // |
| 443 | // (s3, a3) = (s1, a1) × (s2, a2) |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 444 | // a3 = a1 ∧ a2 |
| 445 | // s3 = (s1 ⊕ s2) ∧ a3 |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 446 | // |
| 447 | // (s3, a3) = (s1, a1) + (s2, a2) |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 448 | // t = s1 ⊕ a2 |
| 449 | // s3 = t ∧ (s2 ⊕ a1) |
| 450 | // a3 = (a1 ⊕ a2) ∨ (t ⊕ s2) |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 451 | // |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 452 | // (s3, a3) = (s1, a1) - (s2, a2) |
| 453 | // t = a1 ⊕ a2 |
| 454 | // s3 = (s1 ⊕ a2) ∧ (t ⊕ s2) |
| 455 | // a3 = t ∨ (s1 ⊕ s2) |
| 456 | // |
| 457 | // Negating a value just involves XORing s by a. |
| 458 | // |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 459 | // struct poly3 { |
| 460 | // struct poly2 s, a; |
| 461 | // }; |
| 462 | |
| 463 | OPENSSL_UNUSED static void poly3_print(const struct poly3 *in) { |
| 464 | struct poly3 p; |
| 465 | OPENSSL_memcpy(&p, in, sizeof(p)); |
| 466 | p.s.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1; |
| 467 | p.a.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1; |
| 468 | |
| 469 | printf("{["); |
| 470 | for (unsigned i = 0; i < WORDS_PER_POLY; i++) { |
| 471 | if (i) { |
| 472 | printf(" "); |
| 473 | } |
| 474 | printf(BN_HEX_FMT2, p.s.v[i]); |
| 475 | } |
| 476 | printf("] ["); |
| 477 | for (unsigned i = 0; i < WORDS_PER_POLY; i++) { |
| 478 | if (i) { |
| 479 | printf(" "); |
| 480 | } |
| 481 | printf(BN_HEX_FMT2, p.a.v[i]); |
| 482 | } |
| 483 | printf("]}\n"); |
| 484 | } |
| 485 | |
| 486 | static void poly3_zero(struct poly3 *p) { |
| 487 | poly2_zero(&p->s); |
| 488 | poly2_zero(&p->a); |
| 489 | } |
| 490 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 491 | // poly3_reverse_700 reverses the order of the first 700 terms of |in| and |
| 492 | // writes them to |out|. |
| 493 | static void poly3_reverse_700(struct poly3 *out, const struct poly3 *in) { |
| 494 | poly2_reverse_700(&out->a, &in->a); |
| 495 | poly2_reverse_700(&out->s, &in->s); |
| 496 | } |
| 497 | |
| 498 | // poly3_word_mul sets (|out_s|, |out_a|) to (|s1|, |a1|) × (|s2|, |a2|). |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 499 | static void poly3_word_mul(crypto_word_t *out_s, crypto_word_t *out_a, |
| 500 | const crypto_word_t s1, const crypto_word_t a1, |
| 501 | const crypto_word_t s2, const crypto_word_t a2) { |
| 502 | *out_a = a1 & a2; |
| 503 | *out_s = (s1 ^ s2) & *out_a; |
| 504 | } |
| 505 | |
| 506 | // poly3_word_add sets (|out_s|, |out_a|) to (|s1|, |a1|) + (|s2|, |a2|). |
| 507 | static void poly3_word_add(crypto_word_t *out_s, crypto_word_t *out_a, |
| 508 | const crypto_word_t s1, const crypto_word_t a1, |
| 509 | const crypto_word_t s2, const crypto_word_t a2) { |
| 510 | const crypto_word_t t = s1 ^ a2; |
| 511 | *out_s = t & (s2 ^ a1); |
| 512 | *out_a = (a1 ^ a2) | (t ^ s2); |
| 513 | } |
| 514 | |
| 515 | // poly3_word_sub sets (|out_s|, |out_a|) to (|s1|, |a1|) - (|s2|, |a2|). |
| 516 | static void poly3_word_sub(crypto_word_t *out_s, crypto_word_t *out_a, |
| 517 | const crypto_word_t s1, const crypto_word_t a1, |
| 518 | const crypto_word_t s2, const crypto_word_t a2) { |
| 519 | const crypto_word_t t = a1 ^ a2; |
| 520 | *out_s = (s1 ^ a2) & (t ^ s2); |
| 521 | *out_a = t | (s1 ^ s2); |
| 522 | } |
| 523 | |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 524 | // poly3_mul_const sets |p| to |p|×m, where m = (ms, ma). |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 525 | static void poly3_mul_const(struct poly3 *p, crypto_word_t ms, |
| 526 | crypto_word_t ma) { |
| 527 | ms = lsb_to_all(ms); |
| 528 | ma = lsb_to_all(ma); |
| 529 | |
| 530 | for (size_t i = 0; i < WORDS_PER_POLY; i++) { |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 531 | poly3_word_mul(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], ms, ma); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 532 | } |
| 533 | } |
| 534 | |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 535 | // poly3_fmadd sets |out| to |out| - |in|×m, where m is (ms, ma). |
| 536 | static void poly3_fmsub(struct poly3 *RESTRICT out, |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 537 | const struct poly3 *RESTRICT in, crypto_word_t ms, |
| 538 | crypto_word_t ma) { |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 539 | crypto_word_t product_s, product_a; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 540 | for (size_t i = 0; i < WORDS_PER_POLY; i++) { |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 541 | poly3_word_mul(&product_s, &product_a, in->s.v[i], in->a.v[i], ms, ma); |
| 542 | poly3_word_sub(&out->s.v[i], &out->a.v[i], out->s.v[i], out->a.v[i], |
| 543 | product_s, product_a); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 544 | } |
| 545 | } |
| 546 | |
| 547 | // final_bit_to_all replicates the bit in the final position of the last word to |
| 548 | // all the bits in the word. |
| 549 | static crypto_word_t final_bit_to_all(crypto_word_t v) { |
| 550 | return lsb_to_all(v >> (BITS_IN_LAST_WORD - 1)); |
| 551 | } |
| 552 | |
| 553 | // poly3_top_bits_are_clear returns one iff the extra bits in the final words of |
| 554 | // |p| are zero. |
| 555 | OPENSSL_UNUSED static int poly3_top_bits_are_clear(const struct poly3 *p) { |
| 556 | return poly2_top_bits_are_clear(&p->s) && poly2_top_bits_are_clear(&p->a); |
| 557 | } |
| 558 | |
| 559 | // poly3_mod_phiN reduces |p| by Φ(N). |
| 560 | static void poly3_mod_phiN(struct poly3 *p) { |
| 561 | // In order to reduce by Φ(N) we subtract by the value of the greatest |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 562 | // coefficient. |
| 563 | const crypto_word_t factor_s = final_bit_to_all(p->s.v[WORDS_PER_POLY - 1]); |
| 564 | const crypto_word_t factor_a = final_bit_to_all(p->a.v[WORDS_PER_POLY - 1]); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 565 | |
| 566 | for (size_t i = 0; i < WORDS_PER_POLY; i++) { |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 567 | poly3_word_sub(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], factor_s, |
| 568 | factor_a); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 569 | } |
| 570 | |
| 571 | poly2_clear_top_bits(&p->s); |
| 572 | poly2_clear_top_bits(&p->a); |
| 573 | } |
| 574 | |
| 575 | static void poly3_cswap(struct poly3 *a, struct poly3 *b, crypto_word_t swap) { |
| 576 | poly2_cswap(&a->s, &b->s, swap); |
| 577 | poly2_cswap(&a->a, &b->a, swap); |
| 578 | } |
| 579 | |
| 580 | static void poly3_lshift1(struct poly3 *p) { |
| 581 | poly2_lshift1(&p->s); |
| 582 | poly2_lshift1(&p->a); |
| 583 | } |
| 584 | |
| 585 | static void poly3_rshift1(struct poly3 *p) { |
| 586 | poly2_rshift1(&p->s); |
| 587 | poly2_rshift1(&p->a); |
| 588 | } |
| 589 | |
| 590 | // poly3_span represents a pointer into a poly3. |
| 591 | struct poly3_span { |
| 592 | crypto_word_t *s; |
| 593 | crypto_word_t *a; |
| 594 | }; |
| 595 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 596 | // poly3_span_add adds |n| words of values from |a| and |b| and writes the |
| 597 | // result to |out|. |
| 598 | static void poly3_span_add(const struct poly3_span *out, |
| 599 | const struct poly3_span *a, |
| 600 | const struct poly3_span *b, size_t n) { |
| 601 | for (size_t i = 0; i < n; i++) { |
| 602 | poly3_word_add(&out->s[i], &out->a[i], a->s[i], a->a[i], b->s[i], b->a[i]); |
| 603 | } |
| 604 | } |
| 605 | |
| 606 | // poly3_span_sub subtracts |n| words of |b| from |n| words of |a|. |
| 607 | static void poly3_span_sub(const struct poly3_span *a, |
| 608 | const struct poly3_span *b, size_t n) { |
| 609 | for (size_t i = 0; i < n; i++) { |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 610 | poly3_word_sub(&a->s[i], &a->a[i], a->s[i], a->a[i], b->s[i], b->a[i]); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 611 | } |
| 612 | } |
| 613 | |
| 614 | // poly3_mul_aux is a recursive function that multiplies |n| words from |a| and |
| 615 | // |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements of |
| 616 | // |scratch| and the function recurses, except if |n| == 1, when |scratch| isn't |
| 617 | // used and the recursion stops. For |n| in {11, 22}, the transitive total |
| 618 | // amount of |scratch| needed happens to be 2n+2. |
| 619 | static void poly3_mul_aux(const struct poly3_span *out, |
| 620 | const struct poly3_span *scratch, |
| 621 | const struct poly3_span *a, |
| 622 | const struct poly3_span *b, size_t n) { |
| 623 | if (n == 1) { |
| 624 | crypto_word_t r_s_low = 0, r_s_high = 0, r_a_low = 0, r_a_high = 0; |
| 625 | crypto_word_t b_s = b->s[0], b_a = b->a[0]; |
| 626 | const crypto_word_t a_s = a->s[0], a_a = a->a[0]; |
| 627 | |
| 628 | for (size_t i = 0; i < BITS_PER_WORD; i++) { |
| 629 | // Multiply (s, a) by the next value from (b_s, b_a). |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 630 | crypto_word_t m_s, m_a; |
| 631 | poly3_word_mul(&m_s, &m_a, a_s, a_a, lsb_to_all(b_s), lsb_to_all(b_a)); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 632 | b_s >>= 1; |
| 633 | b_a >>= 1; |
| 634 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 635 | if (i == 0) { |
| 636 | // Special case otherwise the code tries to shift by BITS_PER_WORD |
| 637 | // below, which is undefined. |
| 638 | r_s_low = m_s; |
| 639 | r_a_low = m_a; |
| 640 | continue; |
| 641 | } |
| 642 | |
| 643 | // Shift the multiplication result to the correct position. |
| 644 | const crypto_word_t m_s_low = m_s << i; |
| 645 | const crypto_word_t m_s_high = m_s >> (BITS_PER_WORD - i); |
| 646 | const crypto_word_t m_a_low = m_a << i; |
| 647 | const crypto_word_t m_a_high = m_a >> (BITS_PER_WORD - i); |
| 648 | |
| 649 | // Add into the result. |
| 650 | poly3_word_add(&r_s_low, &r_a_low, r_s_low, r_a_low, m_s_low, m_a_low); |
| 651 | poly3_word_add(&r_s_high, &r_a_high, r_s_high, r_a_high, m_s_high, |
| 652 | m_a_high); |
| 653 | } |
| 654 | |
| 655 | out->s[0] = r_s_low; |
| 656 | out->s[1] = r_s_high; |
| 657 | out->a[0] = r_a_low; |
| 658 | out->a[1] = r_a_high; |
| 659 | return; |
| 660 | } |
| 661 | |
| 662 | // Karatsuba multiplication. |
| 663 | // https://en.wikipedia.org/wiki/Karatsuba_algorithm |
| 664 | |
| 665 | // When |n| is odd, the two "halves" will have different lengths. The first |
| 666 | // is always the smaller. |
| 667 | const size_t low_len = n / 2; |
| 668 | const size_t high_len = n - low_len; |
| 669 | const struct poly3_span a_high = {&a->s[low_len], &a->a[low_len]}; |
| 670 | const struct poly3_span b_high = {&b->s[low_len], &b->a[low_len]}; |
| 671 | |
| 672 | // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second |
| 673 | // half. |
| 674 | const struct poly3_span a_cross_sum = *out; |
| 675 | const struct poly3_span b_cross_sum = {&out->s[high_len], &out->a[high_len]}; |
| 676 | poly3_span_add(&a_cross_sum, a, &a_high, low_len); |
| 677 | poly3_span_add(&b_cross_sum, b, &b_high, low_len); |
| 678 | if (high_len != low_len) { |
| 679 | a_cross_sum.s[low_len] = a_high.s[low_len]; |
| 680 | a_cross_sum.a[low_len] = a_high.a[low_len]; |
| 681 | b_cross_sum.s[low_len] = b_high.s[low_len]; |
| 682 | b_cross_sum.a[low_len] = b_high.a[low_len]; |
| 683 | } |
| 684 | |
| 685 | const struct poly3_span child_scratch = {&scratch->s[2 * high_len], |
| 686 | &scratch->a[2 * high_len]}; |
| 687 | const struct poly3_span out_mid = {&out->s[low_len], &out->a[low_len]}; |
| 688 | const struct poly3_span out_high = {&out->s[2 * low_len], |
| 689 | &out->a[2 * low_len]}; |
| 690 | |
| 691 | // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer. |
| 692 | poly3_mul_aux(scratch, &child_scratch, &a_cross_sum, &b_cross_sum, high_len); |
| 693 | // Calculate a_1 × b_1. |
| 694 | poly3_mul_aux(&out_high, &child_scratch, &a_high, &b_high, high_len); |
| 695 | // Calculate a_0 × b_0. |
| 696 | poly3_mul_aux(out, &child_scratch, a, b, low_len); |
| 697 | |
| 698 | // Subtract those last two products from the first. |
| 699 | poly3_span_sub(scratch, out, low_len * 2); |
| 700 | poly3_span_sub(scratch, &out_high, high_len * 2); |
| 701 | |
| 702 | // Add the middle product into the output. |
| 703 | poly3_span_add(&out_mid, &out_mid, scratch, high_len * 2); |
| 704 | } |
| 705 | |
| 706 | // HRSS_poly3_mul sets |*out| to |x|×|y| mod Φ(N). |
| 707 | void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x, |
| 708 | const struct poly3 *y) { |
| 709 | crypto_word_t prod_s[WORDS_PER_POLY * 2]; |
| 710 | crypto_word_t prod_a[WORDS_PER_POLY * 2]; |
| 711 | crypto_word_t scratch_s[WORDS_PER_POLY * 2 + 2]; |
| 712 | crypto_word_t scratch_a[WORDS_PER_POLY * 2 + 2]; |
| 713 | const struct poly3_span prod_span = {prod_s, prod_a}; |
| 714 | const struct poly3_span scratch_span = {scratch_s, scratch_a}; |
| 715 | const struct poly3_span x_span = {(crypto_word_t *)x->s.v, |
| 716 | (crypto_word_t *)x->a.v}; |
| 717 | const struct poly3_span y_span = {(crypto_word_t *)y->s.v, |
| 718 | (crypto_word_t *)y->a.v}; |
| 719 | |
| 720 | poly3_mul_aux(&prod_span, &scratch_span, &x_span, &y_span, WORDS_PER_POLY); |
| 721 | |
| 722 | // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the |
| 723 | // upper-half to the lower-half. However, N is 701, which isn't a multiple of |
| 724 | // BITS_PER_WORD, so the upper-half vectors all have to be shifted before |
| 725 | // being added to the lower-half. |
| 726 | for (size_t i = 0; i < WORDS_PER_POLY; i++) { |
| 727 | crypto_word_t v_s = prod_s[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD; |
| 728 | v_s |= prod_s[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD); |
| 729 | crypto_word_t v_a = prod_a[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD; |
| 730 | v_a |= prod_a[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD); |
| 731 | |
| 732 | poly3_word_add(&out->s.v[i], &out->a.v[i], prod_s[i], prod_a[i], v_s, v_a); |
| 733 | } |
| 734 | |
| 735 | poly3_mod_phiN(out); |
| 736 | } |
| 737 | |
| 738 | #if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64) |
| 739 | |
| 740 | // poly3_vec_cswap swaps (|a_s|, |a_a|) and (|b_s|, |b_a|) if |swap| is |
| 741 | // |0xff..ff|. Otherwise, |swap| must be zero. |
| 742 | static inline void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6], |
| 743 | vec_t b_a[6], const vec_t swap) { |
| 744 | for (int i = 0; i < 6; i++) { |
| 745 | const vec_t sum_s = swap & (a_s[i] ^ b_s[i]); |
| 746 | a_s[i] ^= sum_s; |
| 747 | b_s[i] ^= sum_s; |
| 748 | |
| 749 | const vec_t sum_a = swap & (a_a[i] ^ b_a[i]); |
| 750 | a_a[i] ^= sum_a; |
| 751 | b_a[i] ^= sum_a; |
| 752 | } |
| 753 | } |
| 754 | |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 755 | // poly3_vec_fmsub subtracts (|ms|, |ma|) × (|b_s|, |b_a|) from (|a_s|, |a_a|). |
| 756 | static inline void poly3_vec_fmsub(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6], |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 757 | vec_t b_a[6], const vec_t ms, |
| 758 | const vec_t ma) { |
| 759 | for (int i = 0; i < 6; i++) { |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 760 | // See the bitslice formula, above. |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 761 | const vec_t s = b_s[i]; |
| 762 | const vec_t a = b_a[i]; |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 763 | const vec_t product_a = a & ma; |
| 764 | const vec_t product_s = (s ^ ms) & product_a; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 765 | |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 766 | const vec_t out_s = a_s[i]; |
| 767 | const vec_t out_a = a_a[i]; |
| 768 | const vec_t t = out_a ^ product_a; |
| 769 | a_s[i] = (out_s ^ product_a) & (t ^ product_s); |
| 770 | a_a[i] = t | (out_s ^ product_s); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 771 | } |
| 772 | } |
| 773 | |
| 774 | // poly3_invert_vec sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod |
| 775 | // Φ(N). |
| 776 | static void poly3_invert_vec(struct poly3 *out, const struct poly3 *in) { |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 777 | // This algorithm is taken from section 7.1 of [SAFEGCD]. |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 778 | const vec_t kZero = {0}; |
| 779 | const vec_t kOne = {1}; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 780 | static const uint8_t kBottomSixtyOne[sizeof(vec_t)] = { |
| 781 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f}; |
| 782 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 783 | vec_t v_s[6], v_a[6], r_s[6], r_a[6], f_s[6], f_a[6], g_s[6], g_a[6]; |
| 784 | // v = 0 |
| 785 | memset(&v_s, 0, sizeof(v_s)); |
| 786 | memset(&v_a, 0, sizeof(v_a)); |
| 787 | // r = 1 |
| 788 | memset(&r_s, 0, sizeof(r_s)); |
| 789 | memset(&r_a, 0, sizeof(r_a)); |
| 790 | r_a[0] = kOne; |
| 791 | // f = all ones. |
| 792 | memset(f_s, 0, sizeof(f_s)); |
| 793 | memset(f_a, 0xff, 5 * sizeof(vec_t)); |
| 794 | memcpy(&f_a[5], kBottomSixtyOne, sizeof(kBottomSixtyOne)); |
| 795 | // g is the reversal of |in|. |
| 796 | struct poly3 in_reversed; |
| 797 | poly3_reverse_700(&in_reversed, in); |
| 798 | g_s[5] = kZero; |
| 799 | memcpy(&g_s, &in_reversed.s.v, WORDS_PER_POLY * sizeof(crypto_word_t)); |
| 800 | g_a[5] = kZero; |
| 801 | memcpy(&g_a, &in_reversed.a.v, WORDS_PER_POLY * sizeof(crypto_word_t)); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 802 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 803 | int delta = 1; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 804 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 805 | for (size_t i = 0; i < (2*(N-1)) - 1; i++) { |
| 806 | poly3_vec_lshift1(v_s, v_a); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 807 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 808 | const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1; |
| 809 | const crypto_word_t delta_is_non_negative = delta_sign_bit - 1; |
| 810 | const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta); |
| 811 | const vec_t g_has_constant_term = vec_broadcast_bit(g_a[0]); |
| 812 | const vec_t mask_w = |
| 813 | {delta_is_non_negative & delta_is_non_zero}; |
| 814 | const vec_t mask = vec_broadcast_bit(mask_w) & g_has_constant_term; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 815 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 816 | const vec_t c_a = vec_broadcast_bit(f_a[0] & g_a[0]); |
| 817 | const vec_t c_s = vec_broadcast_bit((f_s[0] ^ g_s[0]) & c_a); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 818 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 819 | delta = constant_time_select_int(lsb_to_all(mask[0]), -delta, delta); |
| 820 | delta++; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 821 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 822 | poly3_vec_cswap(f_s, f_a, g_s, g_a, mask); |
| 823 | poly3_vec_fmsub(g_s, g_a, f_s, f_a, c_s, c_a); |
| 824 | poly3_vec_rshift1(g_s, g_a); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 825 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 826 | poly3_vec_cswap(v_s, v_a, r_s, r_a, mask); |
| 827 | poly3_vec_fmsub(r_s, r_a, v_s, v_a, c_s, c_a); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 828 | } |
| 829 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 830 | assert(delta == 0); |
| 831 | memcpy(out->s.v, v_s, WORDS_PER_POLY * sizeof(crypto_word_t)); |
| 832 | memcpy(out->a.v, v_a, WORDS_PER_POLY * sizeof(crypto_word_t)); |
| 833 | poly3_mul_const(out, vec_get_word(f_s[0], 0), vec_get_word(f_a[0], 0)); |
| 834 | poly3_reverse_700(out, out); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 835 | } |
| 836 | |
| 837 | #endif // HRSS_HAVE_VECTOR_UNIT |
| 838 | |
| 839 | // HRSS_poly3_invert sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod |
| 840 | // Φ(N). |
| 841 | void HRSS_poly3_invert(struct poly3 *out, const struct poly3 *in) { |
| 842 | // The vector version of this function seems slightly slower on AArch64, but |
| 843 | // is useful on ARMv7 and x86-64. |
| 844 | #if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64) |
| 845 | if (vec_capable()) { |
| 846 | poly3_invert_vec(out, in); |
| 847 | return; |
| 848 | } |
| 849 | #endif |
| 850 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 851 | // This algorithm is taken from section 7.1 of [SAFEGCD]. |
| 852 | struct poly3 v, r, f, g; |
| 853 | // v = 0 |
| 854 | poly3_zero(&v); |
| 855 | // r = 1 |
| 856 | poly3_zero(&r); |
| 857 | r.a.v[0] = 1; |
| 858 | // f = all ones. |
| 859 | OPENSSL_memset(&f.s, 0, sizeof(struct poly2)); |
| 860 | OPENSSL_memset(&f.a, 0xff, sizeof(struct poly2)); |
| 861 | f.a.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD; |
| 862 | // g is the reversal of |in|. |
| 863 | poly3_reverse_700(&g, in); |
| 864 | int delta = 1; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 865 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 866 | for (size_t i = 0; i < (2*(N-1)) - 1; i++) { |
| 867 | poly3_lshift1(&v); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 868 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 869 | const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1; |
| 870 | const crypto_word_t delta_is_non_negative = delta_sign_bit - 1; |
| 871 | const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta); |
| 872 | const crypto_word_t g_has_constant_term = lsb_to_all(g.a.v[0]); |
| 873 | const crypto_word_t mask = |
| 874 | g_has_constant_term & delta_is_non_negative & delta_is_non_zero; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 875 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 876 | crypto_word_t c_s, c_a; |
| 877 | poly3_word_mul(&c_s, &c_a, f.s.v[0], f.a.v[0], g.s.v[0], g.a.v[0]); |
| 878 | c_s = lsb_to_all(c_s); |
| 879 | c_a = lsb_to_all(c_a); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 880 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 881 | delta = constant_time_select_int(mask, -delta, delta); |
| 882 | delta++; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 883 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 884 | poly3_cswap(&f, &g, mask); |
| 885 | poly3_fmsub(&g, &f, c_s, c_a); |
| 886 | poly3_rshift1(&g); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 887 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 888 | poly3_cswap(&v, &r, mask); |
| 889 | poly3_fmsub(&r, &v, c_s, c_a); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 890 | } |
| 891 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 892 | assert(delta == 0); |
| 893 | poly3_mul_const(&v, f.s.v[0], f.a.v[0]); |
| 894 | poly3_reverse_700(out, &v); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 895 | } |
| 896 | |
| 897 | // Polynomials in Q. |
| 898 | |
| 899 | // Coefficients are reduced mod Q. (Q is clearly not prime, therefore the |
| 900 | // coefficients do not form a field.) |
| 901 | #define Q 8192 |
| 902 | |
| 903 | // VECS_PER_POLY is the number of 128-bit vectors needed to represent a |
| 904 | // polynomial. |
| 905 | #define COEFFICIENTS_PER_VEC (sizeof(vec_t) / sizeof(uint16_t)) |
| 906 | #define VECS_PER_POLY ((N + COEFFICIENTS_PER_VEC - 1) / COEFFICIENTS_PER_VEC) |
| 907 | |
| 908 | // poly represents a polynomial with coefficients mod Q. Note that, while Q is a |
| 909 | // power of two, this does not operate in GF(Q). That would be a binary field |
| 910 | // but this is simply mod Q. Thus the coefficients are not a field. |
| 911 | // |
| 912 | // Coefficients are ordered little-endian, thus the coefficient of x^0 is the |
| 913 | // first element of the array. |
| 914 | struct poly { |
| 915 | #if defined(HRSS_HAVE_VECTOR_UNIT) |
| 916 | union { |
| 917 | // N + 3 = 704, which is a multiple of 64 and thus aligns things, esp for |
| 918 | // the vector code. |
| 919 | uint16_t v[N + 3]; |
| 920 | vec_t vectors[VECS_PER_POLY]; |
| 921 | }; |
| 922 | #else |
Adam Langley | 1ea083d | 2018-12-12 10:46:13 -0800 | [diff] [blame] | 923 | // Even if !HRSS_HAVE_VECTOR_UNIT, external assembly may be called that |
| 924 | // requires alignment. |
| 925 | alignas(16) uint16_t v[N + 3]; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 926 | #endif |
| 927 | }; |
| 928 | |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 929 | // poly_normalize zeros out the excess elements of |x| which are included only |
| 930 | // for alignment. |
| 931 | static void poly_normalize(struct poly *x) { |
| 932 | OPENSSL_memset(&x->v[N], 0, 3 * sizeof(uint16_t)); |
| 933 | } |
| 934 | |
| 935 | // poly_assert_normalized asserts that the excess elements of |x| are zeroed out |
| 936 | // for the cases that case. (E.g. |poly_mul_vec|.) |
| 937 | static void poly_assert_normalized(const struct poly *x) { |
| 938 | assert(x->v[N] == 0); |
| 939 | assert(x->v[N + 1] == 0); |
| 940 | assert(x->v[N + 2] == 0); |
| 941 | } |
| 942 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 943 | OPENSSL_UNUSED static void poly_print(const struct poly *p) { |
| 944 | printf("["); |
| 945 | for (unsigned i = 0; i < N; i++) { |
| 946 | if (i) { |
| 947 | printf(" "); |
| 948 | } |
| 949 | printf("%d", p->v[i]); |
| 950 | } |
| 951 | printf("]\n"); |
| 952 | } |
| 953 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 954 | // POLY_MUL_SCRATCH contains space for the working variables needed by |
| 955 | // |poly_mul|. The contents afterwards may be discarded, but the object may also |
| 956 | // be reused with future |poly_mul| calls to save heap allocations. |
| 957 | // |
| 958 | // This object must have 32-byte alignment. |
| 959 | struct POLY_MUL_SCRATCH { |
| 960 | union { |
| 961 | // This is used by |poly_mul_novec|. |
| 962 | struct { |
| 963 | uint16_t prod[2 * N]; |
| 964 | uint16_t scratch[1318]; |
| 965 | } novec; |
| 966 | |
| 967 | #if defined(HRSS_HAVE_VECTOR_UNIT) |
| 968 | // This is used by |poly_mul_vec|. |
| 969 | struct { |
| 970 | vec_t prod[VECS_PER_POLY * 2]; |
| 971 | vec_t scratch[172]; |
| 972 | } vec; |
| 973 | #endif |
Adam Langley | 04b3a96 | 2023-02-08 21:08:49 +0000 | [diff] [blame] | 974 | |
| 975 | #if defined(POLY_RQ_MUL_ASM) |
| 976 | // This is the space used by |poly_Rq_mul|. |
| 977 | uint8_t rq[POLY_MUL_RQ_SCRATCH_SPACE]; |
| 978 | #endif |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 979 | } u; |
| 980 | }; |
| 981 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 982 | #if defined(HRSS_HAVE_VECTOR_UNIT) |
| 983 | |
| 984 | // poly_mul_vec_aux is a recursive function that multiplies |n| words from |a| |
| 985 | // and |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements |
| 986 | // of |scratch| and the function recurses, except if |n| < 3, when |scratch| |
| 987 | // isn't used and the recursion stops. If |n| == |VECS_PER_POLY| then |scratch| |
| 988 | // needs 172 elements. |
| 989 | static void poly_mul_vec_aux(vec_t *restrict out, vec_t *restrict scratch, |
| 990 | const vec_t *restrict a, const vec_t *restrict b, |
| 991 | const size_t n) { |
| 992 | // In [HRSS], the technique they used for polynomial multiplication is |
| 993 | // described: they start with Toom-4 at the top level and then two layers of |
| 994 | // Karatsuba. Karatsuba is a specific instance of the general Toom–Cook |
| 995 | // decomposition, which splits an input n-ways and produces 2n-1 |
| 996 | // multiplications of those parts. So, starting with 704 coefficients (rounded |
| 997 | // up from 701 to have more factors of two), Toom-4 gives seven |
| 998 | // multiplications of degree-174 polynomials. Each round of Karatsuba (which |
| 999 | // is Toom-2) increases the number of multiplications by a factor of three |
| 1000 | // while halving the size of the values being multiplied. So two rounds gives |
| 1001 | // 63 multiplications of degree-44 polynomials. Then they (I think) form |
| 1002 | // vectors by gathering all 63 coefficients of each power together, for each |
| 1003 | // input, and doing more rounds of Karatsuba on the vectors until they bottom- |
| 1004 | // out somewhere with schoolbook multiplication. |
| 1005 | // |
| 1006 | // I tried something like that for NEON. NEON vectors are 128 bits so hold |
| 1007 | // eight coefficients. I wrote a function that did Karatsuba on eight |
| 1008 | // multiplications at the same time, using such vectors, and a Go script that |
| 1009 | // decomposed from degree-704, with Karatsuba in non-transposed form, until it |
| 1010 | // reached multiplications of degree-44. It batched up those 81 |
| 1011 | // multiplications into lots of eight with a single one left over (which was |
| 1012 | // handled directly). |
| 1013 | // |
| 1014 | // It worked, but it was significantly slower than the dumb algorithm used |
| 1015 | // below. Potentially that was because I misunderstood how [HRSS] did it, or |
| 1016 | // because Clang is bad at generating good code from NEON intrinsics on ARMv7. |
| 1017 | // (Which is true: the code generated by Clang for the below is pretty crap.) |
| 1018 | // |
| 1019 | // This algorithm is much simpler. It just does Karatsuba decomposition all |
| 1020 | // the way down and never transposes. When it gets down to degree-16 or |
| 1021 | // degree-24 values, they are multiplied using schoolbook multiplication and |
| 1022 | // vector intrinsics. The vector operations form each of the eight phase- |
| 1023 | // shifts of one of the inputs, point-wise multiply, and then add into the |
| 1024 | // result at the correct place. This means that 33% (degree-16) or 25% |
| 1025 | // (degree-24) of the multiplies and adds are wasted, but it does ok. |
| 1026 | if (n == 2) { |
| 1027 | vec_t result[4]; |
| 1028 | vec_t vec_a[3]; |
| 1029 | static const vec_t kZero = {0}; |
| 1030 | vec_a[0] = a[0]; |
| 1031 | vec_a[1] = a[1]; |
| 1032 | vec_a[2] = kZero; |
| 1033 | |
| 1034 | result[0] = vec_mul(vec_a[0], vec_get_word(b[0], 0)); |
| 1035 | result[1] = vec_mul(vec_a[1], vec_get_word(b[0], 0)); |
| 1036 | |
| 1037 | result[1] = vec_fma(result[1], vec_a[0], vec_get_word(b[1], 0)); |
| 1038 | result[2] = vec_mul(vec_a[1], vec_get_word(b[1], 0)); |
| 1039 | result[3] = kZero; |
| 1040 | |
| 1041 | vec3_rshift_word(vec_a); |
| 1042 | |
| 1043 | #define BLOCK(x, y) \ |
| 1044 | do { \ |
| 1045 | result[x + 0] = \ |
| 1046 | vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \ |
| 1047 | result[x + 1] = \ |
| 1048 | vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \ |
| 1049 | result[x + 2] = \ |
| 1050 | vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \ |
| 1051 | } while (0) |
| 1052 | |
| 1053 | BLOCK(0, 1); |
| 1054 | BLOCK(1, 9); |
| 1055 | |
| 1056 | vec3_rshift_word(vec_a); |
| 1057 | |
| 1058 | BLOCK(0, 2); |
| 1059 | BLOCK(1, 10); |
| 1060 | |
| 1061 | vec3_rshift_word(vec_a); |
| 1062 | |
| 1063 | BLOCK(0, 3); |
| 1064 | BLOCK(1, 11); |
| 1065 | |
| 1066 | vec3_rshift_word(vec_a); |
| 1067 | |
| 1068 | BLOCK(0, 4); |
| 1069 | BLOCK(1, 12); |
| 1070 | |
| 1071 | vec3_rshift_word(vec_a); |
| 1072 | |
| 1073 | BLOCK(0, 5); |
| 1074 | BLOCK(1, 13); |
| 1075 | |
| 1076 | vec3_rshift_word(vec_a); |
| 1077 | |
| 1078 | BLOCK(0, 6); |
| 1079 | BLOCK(1, 14); |
| 1080 | |
| 1081 | vec3_rshift_word(vec_a); |
| 1082 | |
| 1083 | BLOCK(0, 7); |
| 1084 | BLOCK(1, 15); |
| 1085 | |
| 1086 | #undef BLOCK |
| 1087 | |
| 1088 | memcpy(out, result, sizeof(result)); |
| 1089 | return; |
| 1090 | } |
| 1091 | |
| 1092 | if (n == 3) { |
| 1093 | vec_t result[6]; |
| 1094 | vec_t vec_a[4]; |
| 1095 | static const vec_t kZero = {0}; |
| 1096 | vec_a[0] = a[0]; |
| 1097 | vec_a[1] = a[1]; |
| 1098 | vec_a[2] = a[2]; |
| 1099 | vec_a[3] = kZero; |
| 1100 | |
| 1101 | result[0] = vec_mul(a[0], vec_get_word(b[0], 0)); |
| 1102 | result[1] = vec_mul(a[1], vec_get_word(b[0], 0)); |
| 1103 | result[2] = vec_mul(a[2], vec_get_word(b[0], 0)); |
| 1104 | |
| 1105 | #define BLOCK_PRE(x, y) \ |
| 1106 | do { \ |
| 1107 | result[x + 0] = \ |
| 1108 | vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \ |
| 1109 | result[x + 1] = \ |
| 1110 | vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \ |
| 1111 | result[x + 2] = vec_mul(vec_a[2], vec_get_word(b[y / 8], y % 8)); \ |
| 1112 | } while (0) |
| 1113 | |
| 1114 | BLOCK_PRE(1, 8); |
| 1115 | BLOCK_PRE(2, 16); |
| 1116 | |
| 1117 | result[5] = kZero; |
| 1118 | |
| 1119 | vec4_rshift_word(vec_a); |
| 1120 | |
| 1121 | #define BLOCK(x, y) \ |
| 1122 | do { \ |
| 1123 | result[x + 0] = \ |
| 1124 | vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \ |
| 1125 | result[x + 1] = \ |
| 1126 | vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \ |
| 1127 | result[x + 2] = \ |
| 1128 | vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \ |
| 1129 | result[x + 3] = \ |
| 1130 | vec_fma(result[x + 3], vec_a[3], vec_get_word(b[y / 8], y % 8)); \ |
| 1131 | } while (0) |
| 1132 | |
| 1133 | BLOCK(0, 1); |
| 1134 | BLOCK(1, 9); |
| 1135 | BLOCK(2, 17); |
| 1136 | |
| 1137 | vec4_rshift_word(vec_a); |
| 1138 | |
| 1139 | BLOCK(0, 2); |
| 1140 | BLOCK(1, 10); |
| 1141 | BLOCK(2, 18); |
| 1142 | |
| 1143 | vec4_rshift_word(vec_a); |
| 1144 | |
| 1145 | BLOCK(0, 3); |
| 1146 | BLOCK(1, 11); |
| 1147 | BLOCK(2, 19); |
| 1148 | |
| 1149 | vec4_rshift_word(vec_a); |
| 1150 | |
| 1151 | BLOCK(0, 4); |
| 1152 | BLOCK(1, 12); |
| 1153 | BLOCK(2, 20); |
| 1154 | |
| 1155 | vec4_rshift_word(vec_a); |
| 1156 | |
| 1157 | BLOCK(0, 5); |
| 1158 | BLOCK(1, 13); |
| 1159 | BLOCK(2, 21); |
| 1160 | |
| 1161 | vec4_rshift_word(vec_a); |
| 1162 | |
| 1163 | BLOCK(0, 6); |
| 1164 | BLOCK(1, 14); |
| 1165 | BLOCK(2, 22); |
| 1166 | |
| 1167 | vec4_rshift_word(vec_a); |
| 1168 | |
| 1169 | BLOCK(0, 7); |
| 1170 | BLOCK(1, 15); |
| 1171 | BLOCK(2, 23); |
| 1172 | |
| 1173 | #undef BLOCK |
| 1174 | #undef BLOCK_PRE |
| 1175 | |
| 1176 | memcpy(out, result, sizeof(result)); |
| 1177 | |
| 1178 | return; |
| 1179 | } |
| 1180 | |
| 1181 | // Karatsuba multiplication. |
| 1182 | // https://en.wikipedia.org/wiki/Karatsuba_algorithm |
| 1183 | |
| 1184 | // When |n| is odd, the two "halves" will have different lengths. The first is |
| 1185 | // always the smaller. |
| 1186 | const size_t low_len = n / 2; |
| 1187 | const size_t high_len = n - low_len; |
| 1188 | const vec_t *a_high = &a[low_len]; |
| 1189 | const vec_t *b_high = &b[low_len]; |
| 1190 | |
| 1191 | // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second |
| 1192 | // half. |
| 1193 | for (size_t i = 0; i < low_len; i++) { |
| 1194 | out[i] = vec_add(a_high[i], a[i]); |
| 1195 | out[high_len + i] = vec_add(b_high[i], b[i]); |
| 1196 | } |
| 1197 | if (high_len != low_len) { |
| 1198 | out[low_len] = a_high[low_len]; |
| 1199 | out[high_len + low_len] = b_high[low_len]; |
| 1200 | } |
| 1201 | |
| 1202 | vec_t *const child_scratch = &scratch[2 * high_len]; |
| 1203 | // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer. |
| 1204 | poly_mul_vec_aux(scratch, child_scratch, out, &out[high_len], high_len); |
| 1205 | // Calculate a_1 × b_1. |
| 1206 | poly_mul_vec_aux(&out[low_len * 2], child_scratch, a_high, b_high, high_len); |
| 1207 | // Calculate a_0 × b_0. |
| 1208 | poly_mul_vec_aux(out, child_scratch, a, b, low_len); |
| 1209 | |
| 1210 | // Subtract those last two products from the first. |
| 1211 | for (size_t i = 0; i < low_len * 2; i++) { |
| 1212 | scratch[i] = vec_sub(scratch[i], vec_add(out[i], out[low_len * 2 + i])); |
| 1213 | } |
| 1214 | if (low_len != high_len) { |
| 1215 | scratch[low_len * 2] = vec_sub(scratch[low_len * 2], out[low_len * 4]); |
| 1216 | scratch[low_len * 2 + 1] = |
| 1217 | vec_sub(scratch[low_len * 2 + 1], out[low_len * 4 + 1]); |
| 1218 | } |
| 1219 | |
| 1220 | // Add the middle product into the output. |
| 1221 | for (size_t i = 0; i < high_len * 2; i++) { |
| 1222 | out[low_len + i] = vec_add(out[low_len + i], scratch[i]); |
| 1223 | } |
| 1224 | } |
| 1225 | |
| 1226 | // poly_mul_vec sets |*out| to |x|×|y| mod (𝑥^n - 1). |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1227 | static void poly_mul_vec(struct POLY_MUL_SCRATCH *scratch, struct poly *out, |
| 1228 | const struct poly *x, const struct poly *y) { |
David Benjamin | b7d6320 | 2022-07-26 13:25:02 -0700 | [diff] [blame] | 1229 | static_assert(sizeof(out->v) == sizeof(vec_t) * VECS_PER_POLY, |
| 1230 | "struct poly is the wrong size"); |
| 1231 | static_assert(alignof(struct poly) == alignof(vec_t), |
| 1232 | "struct poly has incorrect alignment"); |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1233 | poly_assert_normalized(x); |
| 1234 | poly_assert_normalized(y); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1235 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1236 | vec_t *const prod = scratch->u.vec.prod; |
| 1237 | vec_t *const aux_scratch = scratch->u.vec.scratch; |
| 1238 | poly_mul_vec_aux(prod, aux_scratch, x->vectors, y->vectors, VECS_PER_POLY); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1239 | |
| 1240 | // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the |
| 1241 | // upper-half to the lower-half. However, N is 701, which isn't a multiple of |
| 1242 | // the vector size, so the upper-half vectors all have to be shifted before |
| 1243 | // being added to the lower-half. |
| 1244 | vec_t *out_vecs = (vec_t *)out->v; |
| 1245 | |
| 1246 | for (size_t i = 0; i < VECS_PER_POLY; i++) { |
| 1247 | const vec_t prev = prod[VECS_PER_POLY - 1 + i]; |
| 1248 | const vec_t this = prod[VECS_PER_POLY + i]; |
| 1249 | out_vecs[i] = vec_add(prod[i], vec_merge_3_5(prev, this)); |
| 1250 | } |
| 1251 | |
| 1252 | OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t)); |
| 1253 | } |
| 1254 | |
| 1255 | #endif // HRSS_HAVE_VECTOR_UNIT |
| 1256 | |
| 1257 | // poly_mul_novec_aux writes the product of |a| and |b| to |out|, using |
| 1258 | // |scratch| as scratch space. It'll use Karatsuba if the inputs are large |
| 1259 | // enough to warrant it. Each call uses 2*ceil(n/2) elements of |scratch| and |
| 1260 | // the function recurses, except if |n| < 64, when |scratch| isn't used and the |
| 1261 | // recursion stops. If |n| == |N| then |scratch| needs 1318 elements. |
| 1262 | static void poly_mul_novec_aux(uint16_t *out, uint16_t *scratch, |
| 1263 | const uint16_t *a, const uint16_t *b, size_t n) { |
| 1264 | static const size_t kSchoolbookLimit = 64; |
| 1265 | if (n < kSchoolbookLimit) { |
| 1266 | OPENSSL_memset(out, 0, sizeof(uint16_t) * n * 2); |
| 1267 | for (size_t i = 0; i < n; i++) { |
| 1268 | for (size_t j = 0; j < n; j++) { |
| 1269 | out[i + j] += (unsigned) a[i] * b[j]; |
| 1270 | } |
| 1271 | } |
| 1272 | |
| 1273 | return; |
| 1274 | } |
| 1275 | |
| 1276 | // Karatsuba multiplication. |
| 1277 | // https://en.wikipedia.org/wiki/Karatsuba_algorithm |
| 1278 | |
| 1279 | // When |n| is odd, the two "halves" will have different lengths. The |
| 1280 | // first is always the smaller. |
| 1281 | const size_t low_len = n / 2; |
| 1282 | const size_t high_len = n - low_len; |
| 1283 | const uint16_t *const a_high = &a[low_len]; |
| 1284 | const uint16_t *const b_high = &b[low_len]; |
| 1285 | |
| 1286 | for (size_t i = 0; i < low_len; i++) { |
| 1287 | out[i] = a_high[i] + a[i]; |
| 1288 | out[high_len + i] = b_high[i] + b[i]; |
| 1289 | } |
| 1290 | if (high_len != low_len) { |
| 1291 | out[low_len] = a_high[low_len]; |
| 1292 | out[high_len + low_len] = b_high[low_len]; |
| 1293 | } |
| 1294 | |
| 1295 | uint16_t *const child_scratch = &scratch[2 * high_len]; |
| 1296 | poly_mul_novec_aux(scratch, child_scratch, out, &out[high_len], high_len); |
| 1297 | poly_mul_novec_aux(&out[low_len * 2], child_scratch, a_high, b_high, |
| 1298 | high_len); |
| 1299 | poly_mul_novec_aux(out, child_scratch, a, b, low_len); |
| 1300 | |
| 1301 | for (size_t i = 0; i < low_len * 2; i++) { |
| 1302 | scratch[i] -= out[i] + out[low_len * 2 + i]; |
| 1303 | } |
| 1304 | if (low_len != high_len) { |
| 1305 | scratch[low_len * 2] -= out[low_len * 4]; |
| 1306 | assert(out[low_len * 4 + 1] == 0); |
| 1307 | } |
| 1308 | |
| 1309 | for (size_t i = 0; i < high_len * 2; i++) { |
| 1310 | out[low_len + i] += scratch[i]; |
| 1311 | } |
| 1312 | } |
| 1313 | |
| 1314 | // poly_mul_novec sets |*out| to |x|×|y| mod (𝑥^n - 1). |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1315 | static void poly_mul_novec(struct POLY_MUL_SCRATCH *scratch, struct poly *out, |
| 1316 | const struct poly *x, const struct poly *y) { |
| 1317 | uint16_t *const prod = scratch->u.novec.prod; |
| 1318 | uint16_t *const aux_scratch = scratch->u.novec.scratch; |
| 1319 | poly_mul_novec_aux(prod, aux_scratch, x->v, y->v, N); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1320 | |
| 1321 | for (size_t i = 0; i < N; i++) { |
| 1322 | out->v[i] = prod[i] + prod[i + N]; |
| 1323 | } |
| 1324 | OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t)); |
| 1325 | } |
| 1326 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1327 | static void poly_mul(struct POLY_MUL_SCRATCH *scratch, struct poly *r, |
| 1328 | const struct poly *a, const struct poly *b) { |
Adam Langley | 04b3a96 | 2023-02-08 21:08:49 +0000 | [diff] [blame] | 1329 | #if defined(POLY_RQ_MUL_ASM) |
| 1330 | if (CRYPTO_is_AVX2_capable()) { |
| 1331 | poly_Rq_mul(r->v, a->v, b->v, scratch->u.rq); |
| 1332 | poly_normalize(r); |
| 1333 | } else |
| 1334 | #endif |
| 1335 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1336 | #if defined(HRSS_HAVE_VECTOR_UNIT) |
| 1337 | if (vec_capable()) { |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1338 | poly_mul_vec(scratch, r, a, b); |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1339 | } else |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1340 | #endif |
| 1341 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1342 | // Fallback, non-vector case. |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1343 | { |
| 1344 | poly_mul_novec(scratch, r, a, b); |
| 1345 | } |
| 1346 | |
| 1347 | poly_assert_normalized(r); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1348 | } |
| 1349 | |
| 1350 | // poly_mul_x_minus_1 sets |p| to |p|×(𝑥 - 1) mod (𝑥^n - 1). |
| 1351 | static void poly_mul_x_minus_1(struct poly *p) { |
| 1352 | // Multiplying by (𝑥 - 1) means negating each coefficient and adding in |
| 1353 | // the value of the previous one. |
| 1354 | const uint16_t orig_final_coefficient = p->v[N - 1]; |
| 1355 | |
| 1356 | for (size_t i = N - 1; i > 0; i--) { |
| 1357 | p->v[i] = p->v[i - 1] - p->v[i]; |
| 1358 | } |
| 1359 | p->v[0] = orig_final_coefficient - p->v[0]; |
| 1360 | } |
| 1361 | |
| 1362 | // poly_mod_phiN sets |p| to |p| mod Φ(N). |
| 1363 | static void poly_mod_phiN(struct poly *p) { |
| 1364 | const uint16_t coeff700 = p->v[N - 1]; |
| 1365 | |
| 1366 | for (unsigned i = 0; i < N; i++) { |
| 1367 | p->v[i] -= coeff700; |
| 1368 | } |
| 1369 | } |
| 1370 | |
| 1371 | // poly_clamp reduces each coefficient mod Q. |
| 1372 | static void poly_clamp(struct poly *p) { |
| 1373 | for (unsigned i = 0; i < N; i++) { |
| 1374 | p->v[i] &= Q - 1; |
| 1375 | } |
| 1376 | } |
| 1377 | |
| 1378 | |
| 1379 | // Conversion functions |
| 1380 | // -------------------- |
| 1381 | |
| 1382 | // poly2_from_poly sets |*out| to |in| mod 2. |
| 1383 | static void poly2_from_poly(struct poly2 *out, const struct poly *in) { |
| 1384 | crypto_word_t *words = out->v; |
| 1385 | unsigned shift = 0; |
| 1386 | crypto_word_t word = 0; |
| 1387 | |
| 1388 | for (unsigned i = 0; i < N; i++) { |
| 1389 | word >>= 1; |
| 1390 | word |= (crypto_word_t)(in->v[i] & 1) << (BITS_PER_WORD - 1); |
| 1391 | shift++; |
| 1392 | |
| 1393 | if (shift == BITS_PER_WORD) { |
| 1394 | *words = word; |
| 1395 | words++; |
| 1396 | word = 0; |
| 1397 | shift = 0; |
| 1398 | } |
| 1399 | } |
| 1400 | |
| 1401 | word >>= BITS_PER_WORD - shift; |
| 1402 | *words = word; |
| 1403 | } |
| 1404 | |
Adam Langley | 72f0155 | 2019-01-21 12:53:43 -0800 | [diff] [blame] | 1405 | // mod3 treats |a| as a signed number and returns |a| mod 3. |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1406 | static uint16_t mod3(int16_t a) { |
| 1407 | const int16_t q = ((int32_t)a * 21845) >> 16; |
| 1408 | int16_t ret = a - 3 * q; |
| 1409 | // At this point, |ret| is in {0, 1, 2, 3} and that needs to be mapped to {0, |
| 1410 | // 1, 2, 0}. |
| 1411 | return ret & ((ret & (ret >> 1)) - 1); |
| 1412 | } |
| 1413 | |
| 1414 | // poly3_from_poly sets |*out| to |in|. |
| 1415 | static void poly3_from_poly(struct poly3 *out, const struct poly *in) { |
| 1416 | crypto_word_t *words_s = out->s.v; |
| 1417 | crypto_word_t *words_a = out->a.v; |
| 1418 | crypto_word_t s = 0; |
| 1419 | crypto_word_t a = 0; |
| 1420 | unsigned shift = 0; |
| 1421 | |
| 1422 | for (unsigned i = 0; i < N; i++) { |
| 1423 | // This duplicates the 13th bit upwards to the top of the uint16, |
| 1424 | // essentially treating it as a sign bit and converting into a signed int16. |
| 1425 | // The signed value is reduced mod 3, yielding {0, 1, 2}. |
| 1426 | const uint16_t v = mod3((int16_t)(in->v[i] << 3) >> 3); |
| 1427 | s >>= 1; |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 1428 | const crypto_word_t s_bit = (crypto_word_t)(v & 2) << (BITS_PER_WORD - 2); |
| 1429 | s |= s_bit; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1430 | a >>= 1; |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 1431 | a |= s_bit | (crypto_word_t)(v & 1) << (BITS_PER_WORD - 1); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1432 | shift++; |
| 1433 | |
| 1434 | if (shift == BITS_PER_WORD) { |
| 1435 | *words_s = s; |
| 1436 | words_s++; |
| 1437 | *words_a = a; |
| 1438 | words_a++; |
| 1439 | s = a = 0; |
| 1440 | shift = 0; |
| 1441 | } |
| 1442 | } |
| 1443 | |
| 1444 | s >>= BITS_PER_WORD - shift; |
| 1445 | a >>= BITS_PER_WORD - shift; |
| 1446 | *words_s = s; |
| 1447 | *words_a = a; |
| 1448 | } |
| 1449 | |
| 1450 | // poly3_from_poly_checked sets |*out| to |in|, which has coefficients in {0, 1, |
| 1451 | // Q-1}. It returns a mask indicating whether all coefficients were found to be |
| 1452 | // in that set. |
| 1453 | static crypto_word_t poly3_from_poly_checked(struct poly3 *out, |
| 1454 | const struct poly *in) { |
| 1455 | crypto_word_t *words_s = out->s.v; |
| 1456 | crypto_word_t *words_a = out->a.v; |
| 1457 | crypto_word_t s = 0; |
| 1458 | crypto_word_t a = 0; |
| 1459 | unsigned shift = 0; |
| 1460 | crypto_word_t ok = CONSTTIME_TRUE_W; |
| 1461 | |
| 1462 | for (unsigned i = 0; i < N; i++) { |
| 1463 | const uint16_t v = in->v[i]; |
| 1464 | // Maps {0, 1, Q-1} to {0, 1, 2}. |
| 1465 | uint16_t mod3 = v & 3; |
| 1466 | mod3 ^= mod3 >> 1; |
| 1467 | const uint16_t expected = (uint16_t)((~((mod3 >> 1) - 1)) | mod3) % Q; |
| 1468 | ok &= constant_time_eq_w(v, expected); |
| 1469 | |
| 1470 | s >>= 1; |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 1471 | const crypto_word_t s_bit = (crypto_word_t)(mod3 & 2) |
| 1472 | << (BITS_PER_WORD - 2); |
| 1473 | s |= s_bit; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1474 | a >>= 1; |
Adam Langley | eadef47 | 2019-01-16 17:20:24 -0800 | [diff] [blame] | 1475 | a |= s_bit | (crypto_word_t)(mod3 & 1) << (BITS_PER_WORD - 1); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1476 | shift++; |
| 1477 | |
| 1478 | if (shift == BITS_PER_WORD) { |
| 1479 | *words_s = s; |
| 1480 | words_s++; |
| 1481 | *words_a = a; |
| 1482 | words_a++; |
| 1483 | s = a = 0; |
| 1484 | shift = 0; |
| 1485 | } |
| 1486 | } |
| 1487 | |
| 1488 | s >>= BITS_PER_WORD - shift; |
| 1489 | a >>= BITS_PER_WORD - shift; |
| 1490 | *words_s = s; |
| 1491 | *words_a = a; |
| 1492 | |
| 1493 | return ok; |
| 1494 | } |
| 1495 | |
| 1496 | static void poly_from_poly2(struct poly *out, const struct poly2 *in) { |
| 1497 | const crypto_word_t *words = in->v; |
| 1498 | unsigned shift = 0; |
| 1499 | crypto_word_t word = *words; |
| 1500 | |
| 1501 | for (unsigned i = 0; i < N; i++) { |
| 1502 | out->v[i] = word & 1; |
| 1503 | word >>= 1; |
| 1504 | shift++; |
| 1505 | |
| 1506 | if (shift == BITS_PER_WORD) { |
| 1507 | words++; |
| 1508 | word = *words; |
| 1509 | shift = 0; |
| 1510 | } |
| 1511 | } |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1512 | |
| 1513 | poly_normalize(out); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1514 | } |
| 1515 | |
| 1516 | static void poly_from_poly3(struct poly *out, const struct poly3 *in) { |
| 1517 | const crypto_word_t *words_s = in->s.v; |
| 1518 | const crypto_word_t *words_a = in->a.v; |
| 1519 | crypto_word_t word_s = ~(*words_s); |
| 1520 | crypto_word_t word_a = *words_a; |
| 1521 | unsigned shift = 0; |
| 1522 | |
| 1523 | for (unsigned i = 0; i < N; i++) { |
| 1524 | out->v[i] = (uint16_t)(word_s & 1) - 1; |
| 1525 | out->v[i] |= word_a & 1; |
| 1526 | word_s >>= 1; |
| 1527 | word_a >>= 1; |
| 1528 | shift++; |
| 1529 | |
| 1530 | if (shift == BITS_PER_WORD) { |
| 1531 | words_s++; |
| 1532 | words_a++; |
| 1533 | word_s = ~(*words_s); |
| 1534 | word_a = *words_a; |
| 1535 | shift = 0; |
| 1536 | } |
| 1537 | } |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1538 | |
| 1539 | poly_normalize(out); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1540 | } |
| 1541 | |
| 1542 | // Polynomial inversion |
| 1543 | // -------------------- |
| 1544 | |
| 1545 | // poly_invert_mod2 sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod |
| 1546 | // Φ(N)), all mod 2. This isn't useful in itself, but is part of doing inversion |
| 1547 | // mod Q. |
| 1548 | static void poly_invert_mod2(struct poly *out, const struct poly *in) { |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 1549 | // This algorithm is taken from section 7.1 of [SAFEGCD]. |
| 1550 | struct poly2 v, r, f, g; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1551 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 1552 | // v = 0 |
| 1553 | poly2_zero(&v); |
| 1554 | // r = 1 |
| 1555 | poly2_zero(&r); |
| 1556 | r.v[0] = 1; |
| 1557 | // f = all ones. |
| 1558 | OPENSSL_memset(&f, 0xff, sizeof(struct poly2)); |
| 1559 | f.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD; |
| 1560 | // g is the reversal of |in|. |
| 1561 | poly2_from_poly(&g, in); |
| 1562 | poly2_mod_phiN(&g); |
| 1563 | poly2_reverse_700(&g, &g); |
| 1564 | int delta = 1; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1565 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 1566 | for (size_t i = 0; i < (2*(N-1)) - 1; i++) { |
| 1567 | poly2_lshift1(&v); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1568 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 1569 | const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1; |
| 1570 | const crypto_word_t delta_is_non_negative = delta_sign_bit - 1; |
| 1571 | const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta); |
| 1572 | const crypto_word_t g_has_constant_term = lsb_to_all(g.v[0]); |
| 1573 | const crypto_word_t mask = |
| 1574 | g_has_constant_term & delta_is_non_negative & delta_is_non_zero; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1575 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 1576 | const crypto_word_t c = lsb_to_all(f.v[0] & g.v[0]); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1577 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 1578 | delta = constant_time_select_int(mask, -delta, delta); |
| 1579 | delta++; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1580 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 1581 | poly2_cswap(&f, &g, mask); |
| 1582 | poly2_fmadd(&g, &f, c); |
| 1583 | poly2_rshift1(&g); |
| 1584 | |
| 1585 | poly2_cswap(&v, &r, mask); |
| 1586 | poly2_fmadd(&r, &v, c); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1587 | } |
| 1588 | |
Adam Langley | cbae965 | 2019-12-06 14:58:52 -0800 | [diff] [blame] | 1589 | assert(delta == 0); |
| 1590 | assert(f.v[0] & 1); |
| 1591 | poly2_reverse_700(&v, &v); |
| 1592 | poly_from_poly2(out, &v); |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1593 | poly_assert_normalized(out); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1594 | } |
| 1595 | |
| 1596 | // poly_invert sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod Φ(N)). |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1597 | static void poly_invert(struct POLY_MUL_SCRATCH *scratch, struct poly *out, |
| 1598 | const struct poly *in) { |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1599 | // Inversion mod Q, which is done based on the result of inverting mod |
| 1600 | // 2. See [NTRUTN14] paper, bottom of page two. |
| 1601 | struct poly a, *b, tmp; |
| 1602 | |
| 1603 | // a = -in. |
| 1604 | for (unsigned i = 0; i < N; i++) { |
| 1605 | a.v[i] = -in->v[i]; |
| 1606 | } |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1607 | poly_normalize(&a); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1608 | |
| 1609 | // b = in^-1 mod 2. |
| 1610 | b = out; |
| 1611 | poly_invert_mod2(b, in); |
| 1612 | |
| 1613 | // We are working mod Q=2**13 and we need to iterate ceil(log_2(13)) |
| 1614 | // times, which is four. |
| 1615 | for (unsigned i = 0; i < 4; i++) { |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1616 | poly_mul(scratch, &tmp, &a, b); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1617 | tmp.v[0] += 2; |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1618 | poly_mul(scratch, b, b, &tmp); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1619 | } |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1620 | |
| 1621 | poly_assert_normalized(out); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1622 | } |
| 1623 | |
| 1624 | // Marshal and unmarshal functions for various basic types. |
| 1625 | // -------------------------------------------------------- |
| 1626 | |
| 1627 | #define POLY_BYTES 1138 |
| 1628 | |
Adam Langley | 9700b44 | 2018-12-17 11:17:26 -0800 | [diff] [blame] | 1629 | // poly_marshal serialises all but the final coefficient of |in| to |out|. |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1630 | static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in) { |
| 1631 | const uint16_t *p = in->v; |
| 1632 | |
| 1633 | for (size_t i = 0; i < N / 8; i++) { |
| 1634 | out[0] = p[0]; |
| 1635 | out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5); |
| 1636 | out[2] = p[1] >> 3; |
| 1637 | out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2); |
| 1638 | out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7); |
| 1639 | out[5] = p[3] >> 1; |
| 1640 | out[6] = (0xf & (p[3] >> 9)) | ((p[4] & 0x0f) << 4); |
| 1641 | out[7] = p[4] >> 4; |
| 1642 | out[8] = (1 & (p[4] >> 12)) | ((p[5] & 0x7f) << 1); |
| 1643 | out[9] = (0x3f & (p[5] >> 7)) | ((p[6] & 0x03) << 6); |
| 1644 | out[10] = p[6] >> 2; |
| 1645 | out[11] = (7 & (p[6] >> 10)) | ((p[7] & 0x1f) << 3); |
| 1646 | out[12] = p[7] >> 5; |
| 1647 | |
| 1648 | p += 8; |
| 1649 | out += 13; |
| 1650 | } |
| 1651 | |
| 1652 | // There are four remaining values. |
| 1653 | out[0] = p[0]; |
| 1654 | out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5); |
| 1655 | out[2] = p[1] >> 3; |
| 1656 | out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2); |
| 1657 | out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7); |
| 1658 | out[5] = p[3] >> 1; |
| 1659 | out[6] = 0xf & (p[3] >> 9); |
| 1660 | } |
| 1661 | |
Adam Langley | 9700b44 | 2018-12-17 11:17:26 -0800 | [diff] [blame] | 1662 | // poly_unmarshal parses the output of |poly_marshal| and sets |out| such that |
| 1663 | // all but the final coefficients match, and the final coefficient is calculated |
| 1664 | // such that evaluating |out| at one results in zero. It returns one on success |
| 1665 | // or zero if |in| is an invalid encoding. |
Adam Langley | f8068ce | 2018-12-17 11:16:23 -0800 | [diff] [blame] | 1666 | static int poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES]) { |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1667 | uint16_t *p = out->v; |
| 1668 | |
| 1669 | for (size_t i = 0; i < N / 8; i++) { |
| 1670 | p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8; |
| 1671 | p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 | |
| 1672 | (uint16_t)(in[3] & 3) << 11; |
| 1673 | p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6; |
| 1674 | p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 | |
| 1675 | (uint16_t)(in[6] & 0xf) << 9; |
| 1676 | p[4] = (uint16_t)(in[6] >> 4) | (uint16_t)(in[7]) << 4 | |
| 1677 | (uint16_t)(in[8] & 1) << 12; |
| 1678 | p[5] = (uint16_t)(in[8] >> 1) | (uint16_t)(in[9] & 0x3f) << 7; |
| 1679 | p[6] = (uint16_t)(in[9] >> 6) | (uint16_t)(in[10]) << 2 | |
| 1680 | (uint16_t)(in[11] & 7) << 10; |
| 1681 | p[7] = (uint16_t)(in[11] >> 3) | (uint16_t)(in[12]) << 5; |
| 1682 | |
| 1683 | p += 8; |
| 1684 | in += 13; |
| 1685 | } |
| 1686 | |
| 1687 | // There are four coefficients remaining. |
| 1688 | p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8; |
| 1689 | p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 | |
| 1690 | (uint16_t)(in[3] & 3) << 11; |
| 1691 | p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6; |
| 1692 | p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 | |
| 1693 | (uint16_t)(in[6] & 0xf) << 9; |
| 1694 | |
| 1695 | for (unsigned i = 0; i < N - 1; i++) { |
| 1696 | out->v[i] = (int16_t)(out->v[i] << 3) >> 3; |
| 1697 | } |
| 1698 | |
Adam Langley | f8068ce | 2018-12-17 11:16:23 -0800 | [diff] [blame] | 1699 | // There are four unused bits in the last byte. We require them to be zero. |
| 1700 | if ((in[6] & 0xf0) != 0) { |
| 1701 | return 0; |
| 1702 | } |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1703 | |
| 1704 | // Set the final coefficient as specifed in [HRSSNIST] 1.9.2 step 6. |
| 1705 | uint32_t sum = 0; |
| 1706 | for (size_t i = 0; i < N - 1; i++) { |
| 1707 | sum += out->v[i]; |
| 1708 | } |
| 1709 | |
| 1710 | out->v[N - 1] = (uint16_t)(0u - sum); |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1711 | poly_normalize(out); |
Adam Langley | f8068ce | 2018-12-17 11:16:23 -0800 | [diff] [blame] | 1712 | |
| 1713 | return 1; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1714 | } |
| 1715 | |
| 1716 | // mod3_from_modQ maps {0, 1, Q-1, 65535} -> {0, 1, 2, 2}. Note that |v| may |
| 1717 | // have an invalid value when processing attacker-controlled inputs. |
| 1718 | static uint16_t mod3_from_modQ(uint16_t v) { |
| 1719 | v &= 3; |
| 1720 | return v ^ (v >> 1); |
| 1721 | } |
| 1722 | |
| 1723 | // poly_marshal_mod3 marshals |in| to |out| where the coefficients of |in| are |
| 1724 | // all in {0, 1, Q-1, 65535} and |in| is mod Φ(N). (Note that coefficients may |
| 1725 | // have invalid values when processing attacker-controlled inputs.) |
| 1726 | static void poly_marshal_mod3(uint8_t out[HRSS_POLY3_BYTES], |
| 1727 | const struct poly *in) { |
| 1728 | const uint16_t *coeffs = in->v; |
| 1729 | |
| 1730 | // Only 700 coefficients are marshaled because in[700] must be zero. |
| 1731 | assert(coeffs[N-1] == 0); |
| 1732 | |
| 1733 | for (size_t i = 0; i < HRSS_POLY3_BYTES; i++) { |
| 1734 | const uint16_t coeffs0 = mod3_from_modQ(coeffs[0]); |
| 1735 | const uint16_t coeffs1 = mod3_from_modQ(coeffs[1]); |
| 1736 | const uint16_t coeffs2 = mod3_from_modQ(coeffs[2]); |
| 1737 | const uint16_t coeffs3 = mod3_from_modQ(coeffs[3]); |
| 1738 | const uint16_t coeffs4 = mod3_from_modQ(coeffs[4]); |
| 1739 | out[i] = coeffs0 + coeffs1 * 3 + coeffs2 * 9 + coeffs3 * 27 + coeffs4 * 81; |
| 1740 | coeffs += 5; |
| 1741 | } |
| 1742 | } |
| 1743 | |
| 1744 | // HRSS-specific functions |
| 1745 | // ----------------------- |
| 1746 | |
Adam Langley | 72f0155 | 2019-01-21 12:53:43 -0800 | [diff] [blame] | 1747 | // poly_short_sample samples a vector of values in {0xffff (i.e. -1), 0, 1}. |
| 1748 | // This is the same action as the algorithm in [HRSSNIST] section 1.8.1, but |
| 1749 | // with HRSS-SXY the sampling algorithm is now a private detail of the |
| 1750 | // implementation (previously it had to match between two parties). This |
| 1751 | // function uses that freedom to implement a flatter distribution of values. |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1752 | static void poly_short_sample(struct poly *out, |
| 1753 | const uint8_t in[HRSS_SAMPLE_BYTES]) { |
David Benjamin | b7d6320 | 2022-07-26 13:25:02 -0700 | [diff] [blame] | 1754 | static_assert(HRSS_SAMPLE_BYTES == N - 1, "HRSS_SAMPLE_BYTES incorrect"); |
Adam Langley | 72f0155 | 2019-01-21 12:53:43 -0800 | [diff] [blame] | 1755 | for (size_t i = 0; i < N - 1; i++) { |
| 1756 | uint16_t v = mod3(in[i]); |
| 1757 | // Map {0, 1, 2} -> {0, 1, 0xffff} |
| 1758 | v |= ((v >> 1) ^ 1) - 1; |
| 1759 | out->v[i] = v; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1760 | } |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1761 | out->v[N - 1] = 0; |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1762 | poly_normalize(out); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1763 | } |
| 1764 | |
| 1765 | // poly_short_sample_plus performs the T+ sample as defined in [HRSSNIST], |
| 1766 | // section 1.8.2. |
| 1767 | static void poly_short_sample_plus(struct poly *out, |
| 1768 | const uint8_t in[HRSS_SAMPLE_BYTES]) { |
| 1769 | poly_short_sample(out, in); |
| 1770 | |
| 1771 | // sum (and the product in the for loop) will overflow. But that's fine |
| 1772 | // because |sum| is bound by +/- (N-2), and N < 2^15 so it works out. |
| 1773 | uint16_t sum = 0; |
| 1774 | for (unsigned i = 0; i < N - 2; i++) { |
| 1775 | sum += (unsigned) out->v[i] * out->v[i + 1]; |
| 1776 | } |
| 1777 | |
| 1778 | // If the sum is negative, flip the sign of even-positioned coefficients. (See |
| 1779 | // page 8 of [HRSS].) |
| 1780 | sum = ((int16_t) sum) >> 15; |
| 1781 | const uint16_t scale = sum | (~sum & 1); |
| 1782 | for (unsigned i = 0; i < N; i += 2) { |
| 1783 | out->v[i] = (unsigned) out->v[i] * scale; |
| 1784 | } |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1785 | poly_assert_normalized(out); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1786 | } |
| 1787 | |
| 1788 | // poly_lift computes the function discussed in [HRSS], appendix B. |
| 1789 | static void poly_lift(struct poly *out, const struct poly *a) { |
| 1790 | // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the |
| 1791 | // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime). |
| 1792 | |
| 1793 | // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up: |
| 1794 | // |
| 1795 | // R.<x> = PolynomialRing(GF(3)…) |
| 1796 | // inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n)) |
| 1797 | // list(inv)[:15] |
| 1798 | // [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2] |
| 1799 | // |
| 1800 | // This three-element pattern of coefficients repeats for the whole |
| 1801 | // polynomial. |
| 1802 | // |
| 1803 | // Next define the overbar operator such that z̅ = z[0] + |
| 1804 | // reverse(z[1:]). (Index zero of a polynomial here is the coefficient |
| 1805 | // of the constant term. So index one is the coefficient of 𝑥 and so |
| 1806 | // on.) |
| 1807 | // |
| 1808 | // A less odd way to define this is to see that z̅ negates the indexes, |
| 1809 | // so z̅[0] = z[-0], z̅[1] = z[-1] and so on. |
| 1810 | // |
| 1811 | // The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = <v, |
| 1812 | // z̅>, vz[1] = <v, 𝑥z̅>, …. (Where <a, b> is the inner product: the sum |
| 1813 | // of the point-wise products.) Although we calculated the inverse mod |
| 1814 | // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end. |
| 1815 | // (That's because (𝑥^N - 1) is a multiple of Φ(N).) |
| 1816 | // |
| 1817 | // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation |
| 1818 | // of the list of coefficients. |
| 1819 | // |
| 1820 | // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like: |
| 1821 | // |
| 1822 | // def reverse(xs): |
| 1823 | // suffix = list(xs[1:]) |
| 1824 | // suffix.reverse() |
| 1825 | // return [xs[0]] + suffix |
| 1826 | // |
| 1827 | // def rotate(xs): |
| 1828 | // return [xs[-1]] + xs[:-1] |
| 1829 | // |
| 1830 | // zoverbar = reverse(list(inv) + [0]) |
| 1831 | // xzoverbar = rotate(reverse(list(inv) + [0])) |
| 1832 | // x2zoverbar = rotate(rotate(reverse(list(inv) + [0]))) |
| 1833 | // |
| 1834 | // zoverbar[:15] |
| 1835 | // [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1] |
| 1836 | // xzoverbar[:15] |
| 1837 | // [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0] |
| 1838 | // x2zoverbar[:15] |
| 1839 | // [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2] |
| 1840 | // |
| 1841 | // (For a formula for z̅, see lemma two of appendix B.) |
| 1842 | // |
| 1843 | // After the first three elements have been taken care of, all then have |
| 1844 | // a repeating three-element cycle. The next value (𝑥^3z̅) involves |
| 1845 | // three rotations of the first pattern, thus the three-element cycle |
| 1846 | // lines up. However, the discontinuity in the first three elements |
| 1847 | // obviously moves to a different position. Consider the difference |
| 1848 | // between 𝑥^3z̅ and z̅: |
| 1849 | // |
| 1850 | // [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15] |
| 1851 | // [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] |
| 1852 | // |
| 1853 | // This pattern of differences is the same for all elements, although it |
| 1854 | // obviously moves right with the rotations. |
| 1855 | // |
| 1856 | // From this, we reach algorithm eight of appendix B. |
| 1857 | |
| 1858 | // Handle the first three elements of the inner products. |
| 1859 | out->v[0] = a->v[0] + a->v[2]; |
| 1860 | out->v[1] = a->v[1]; |
| 1861 | out->v[2] = -a->v[0] + a->v[2]; |
| 1862 | |
| 1863 | // s0, s1, s2 are added into out->v[0], out->v[1], and out->v[2], |
| 1864 | // respectively. We do not compute s1 because it's just -(s0 + s1). |
| 1865 | uint16_t s0 = 0, s2 = 0; |
| 1866 | for (size_t i = 3; i < 699; i += 3) { |
| 1867 | s0 += -a->v[i] + a->v[i + 2]; |
| 1868 | // s1 += a->v[i] - a->v[i + 1]; |
| 1869 | s2 += a->v[i + 1] - a->v[i + 2]; |
| 1870 | } |
| 1871 | |
| 1872 | // Handle the fact that the three-element pattern doesn't fill the |
| 1873 | // polynomial exactly (since 701 isn't a multiple of three). |
| 1874 | s0 -= a->v[699]; |
| 1875 | // s1 += a->v[699] - a->v[700]; |
| 1876 | s2 += a->v[700]; |
| 1877 | |
| 1878 | // Note that s0 + s1 + s2 = 0. |
| 1879 | out->v[0] += s0; |
| 1880 | out->v[1] -= (s0 + s2); // = s1 |
| 1881 | out->v[2] += s2; |
| 1882 | |
| 1883 | // Calculate the remaining inner products by taking advantage of the |
| 1884 | // fact that the pattern repeats every three cycles and the pattern of |
| 1885 | // differences moves with the rotation. |
| 1886 | for (size_t i = 3; i < N; i++) { |
| 1887 | out->v[i] = (out->v[i - 3] - (a->v[i - 2] + a->v[i - 1] + a->v[i])); |
| 1888 | } |
| 1889 | |
| 1890 | // Reduce mod Φ(N) by subtracting a multiple of out[700] from every |
| 1891 | // element and convert to mod Q. (See above about adding twice as |
| 1892 | // subtraction.) |
| 1893 | const crypto_word_t v = out->v[700]; |
| 1894 | for (unsigned i = 0; i < N; i++) { |
| 1895 | const uint16_t vi_mod3 = mod3(out->v[i] - v); |
| 1896 | // Map {0, 1, 2} to {0, 1, 0xffff}. |
| 1897 | out->v[i] = (~((vi_mod3 >> 1) - 1)) | vi_mod3; |
| 1898 | } |
| 1899 | |
| 1900 | poly_mul_x_minus_1(out); |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1901 | poly_normalize(out); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1902 | } |
| 1903 | |
| 1904 | struct public_key { |
| 1905 | struct poly ph; |
| 1906 | }; |
| 1907 | |
| 1908 | struct private_key { |
| 1909 | struct poly3 f, f_inverse; |
| 1910 | struct poly ph_inverse; |
| 1911 | uint8_t hmac_key[32]; |
| 1912 | }; |
| 1913 | |
| 1914 | // public_key_from_external converts an external public key pointer into an |
| 1915 | // internal one. Externally the alignment is only specified to be eight bytes |
| 1916 | // but we need 16-byte alignment. We could annotate the external struct with |
| 1917 | // that alignment but we can only assume that malloced pointers are 8-byte |
| 1918 | // aligned in any case. (Even if the underlying malloc returns values with |
| 1919 | // 16-byte alignment, |OPENSSL_malloc| will store an 8-byte size prefix and mess |
| 1920 | // that up.) |
| 1921 | static struct public_key *public_key_from_external( |
| 1922 | struct HRSS_public_key *ext) { |
David Benjamin | b7d6320 | 2022-07-26 13:25:02 -0700 | [diff] [blame] | 1923 | static_assert( |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1924 | sizeof(struct HRSS_public_key) >= sizeof(struct public_key) + 15, |
| 1925 | "HRSS public key too small"); |
| 1926 | |
David Benjamin | ecc301c | 2021-07-02 12:30:40 -0400 | [diff] [blame] | 1927 | return align_pointer(ext->opaque, 16); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1928 | } |
| 1929 | |
| 1930 | // private_key_from_external does the same thing as |public_key_from_external|, |
| 1931 | // but for private keys. See the comment on that function about alignment |
| 1932 | // issues. |
| 1933 | static struct private_key *private_key_from_external( |
| 1934 | struct HRSS_private_key *ext) { |
David Benjamin | b7d6320 | 2022-07-26 13:25:02 -0700 | [diff] [blame] | 1935 | static_assert( |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1936 | sizeof(struct HRSS_private_key) >= sizeof(struct private_key) + 15, |
| 1937 | "HRSS private key too small"); |
| 1938 | |
David Benjamin | ecc301c | 2021-07-02 12:30:40 -0400 | [diff] [blame] | 1939 | return align_pointer(ext->opaque, 16); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1940 | } |
| 1941 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1942 | // malloc_align32 returns a pointer to |size| bytes of 32-byte-aligned heap and |
| 1943 | // sets |*out_ptr| to a value that can be passed to |OPENSSL_free| to release |
| 1944 | // it. It returns NULL if out of memory. |
| 1945 | static void *malloc_align32(void **out_ptr, size_t size) { |
| 1946 | void *ptr = OPENSSL_malloc(size + 31); |
| 1947 | if (!ptr) { |
| 1948 | *out_ptr = NULL; |
| 1949 | return NULL; |
| 1950 | } |
| 1951 | |
| 1952 | *out_ptr = ptr; |
| 1953 | return align_pointer(ptr, 32); |
| 1954 | } |
| 1955 | |
| 1956 | int HRSS_generate_key( |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1957 | struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv, |
| 1958 | const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32]) { |
| 1959 | struct public_key *pub = public_key_from_external(out_pub); |
| 1960 | struct private_key *priv = private_key_from_external(out_priv); |
| 1961 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1962 | struct vars { |
| 1963 | struct POLY_MUL_SCRATCH scratch; |
| 1964 | struct poly f; |
| 1965 | struct poly pg_phi1; |
| 1966 | struct poly pfg_phi1; |
| 1967 | struct poly pfg_phi1_inverse; |
| 1968 | }; |
| 1969 | |
| 1970 | void *malloc_ptr; |
| 1971 | struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars)); |
| 1972 | if (!vars) { |
| 1973 | // If the caller ignores the return value the output will still be safe. |
| 1974 | // The private key output is randomised in case it's later passed to |
| 1975 | // |HRSS_encap|. |
| 1976 | memset(out_pub, 0, sizeof(struct HRSS_public_key)); |
| 1977 | RAND_bytes((uint8_t*) out_priv, sizeof(struct HRSS_private_key)); |
| 1978 | return 0; |
| 1979 | } |
| 1980 | |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 1981 | #if !defined(NDEBUG) |
| 1982 | OPENSSL_memset(vars, 0xff, sizeof(struct vars)); |
| 1983 | #endif |
| 1984 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1985 | OPENSSL_memcpy(priv->hmac_key, in + 2 * HRSS_SAMPLE_BYTES, |
| 1986 | sizeof(priv->hmac_key)); |
| 1987 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1988 | poly_short_sample_plus(&vars->f, in); |
| 1989 | poly3_from_poly(&priv->f, &vars->f); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1990 | HRSS_poly3_invert(&priv->f_inverse, &priv->f); |
| 1991 | |
| 1992 | // pg_phi1 is p (i.e. 3) × g × Φ(1) (i.e. 𝑥-1). |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1993 | poly_short_sample_plus(&vars->pg_phi1, in + HRSS_SAMPLE_BYTES); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1994 | for (unsigned i = 0; i < N; i++) { |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1995 | vars->pg_phi1.v[i] *= 3; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1996 | } |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1997 | poly_mul_x_minus_1(&vars->pg_phi1); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 1998 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 1999 | poly_mul(&vars->scratch, &vars->pfg_phi1, &vars->f, &vars->pg_phi1); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2000 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2001 | poly_invert(&vars->scratch, &vars->pfg_phi1_inverse, &vars->pfg_phi1); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2002 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2003 | poly_mul(&vars->scratch, &pub->ph, &vars->pfg_phi1_inverse, &vars->pg_phi1); |
| 2004 | poly_mul(&vars->scratch, &pub->ph, &pub->ph, &vars->pg_phi1); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2005 | poly_clamp(&pub->ph); |
| 2006 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2007 | poly_mul(&vars->scratch, &priv->ph_inverse, &vars->pfg_phi1_inverse, |
| 2008 | &vars->f); |
| 2009 | poly_mul(&vars->scratch, &priv->ph_inverse, &priv->ph_inverse, &vars->f); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2010 | poly_clamp(&priv->ph_inverse); |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2011 | |
| 2012 | OPENSSL_free(malloc_ptr); |
| 2013 | return 1; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2014 | } |
| 2015 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2016 | static const char kSharedKey[] = "shared key"; |
| 2017 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2018 | int HRSS_encap(uint8_t out_ciphertext[POLY_BYTES], uint8_t out_shared_key[32], |
| 2019 | const struct HRSS_public_key *in_pub, |
| 2020 | const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES]) { |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2021 | const struct public_key *pub = |
| 2022 | public_key_from_external((struct HRSS_public_key *)in_pub); |
Adam Langley | 9700b44 | 2018-12-17 11:17:26 -0800 | [diff] [blame] | 2023 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2024 | struct vars { |
| 2025 | struct POLY_MUL_SCRATCH scratch; |
| 2026 | struct poly m, r, m_lifted; |
| 2027 | struct poly prh_plus_m; |
| 2028 | SHA256_CTX hash_ctx; |
| 2029 | uint8_t m_bytes[HRSS_POLY3_BYTES]; |
| 2030 | uint8_t r_bytes[HRSS_POLY3_BYTES]; |
| 2031 | }; |
| 2032 | |
| 2033 | void *malloc_ptr; |
| 2034 | struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars)); |
| 2035 | if (!vars) { |
| 2036 | // If the caller ignores the return value the output will still be safe. |
| 2037 | // The private key output is randomised in case it's used to encrypt and |
| 2038 | // transmit something. |
| 2039 | memset(out_ciphertext, 0, POLY_BYTES); |
| 2040 | RAND_bytes(out_shared_key, 32); |
| 2041 | return 0; |
Adam Langley | 9700b44 | 2018-12-17 11:17:26 -0800 | [diff] [blame] | 2042 | } |
| 2043 | |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 2044 | #if !defined(NDEBUG) |
| 2045 | OPENSSL_memset(vars, 0xff, sizeof(struct vars)); |
| 2046 | #endif |
| 2047 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2048 | poly_short_sample(&vars->m, in); |
| 2049 | poly_short_sample(&vars->r, in + HRSS_SAMPLE_BYTES); |
| 2050 | poly_lift(&vars->m_lifted, &vars->m); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2051 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2052 | poly_mul(&vars->scratch, &vars->prh_plus_m, &vars->r, &pub->ph); |
| 2053 | for (unsigned i = 0; i < N; i++) { |
| 2054 | vars->prh_plus_m.v[i] += vars->m_lifted.v[i]; |
| 2055 | } |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2056 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2057 | poly_marshal(out_ciphertext, &vars->prh_plus_m); |
| 2058 | |
| 2059 | poly_marshal_mod3(vars->m_bytes, &vars->m); |
| 2060 | poly_marshal_mod3(vars->r_bytes, &vars->r); |
| 2061 | |
| 2062 | SHA256_Init(&vars->hash_ctx); |
| 2063 | SHA256_Update(&vars->hash_ctx, kSharedKey, sizeof(kSharedKey)); |
| 2064 | SHA256_Update(&vars->hash_ctx, vars->m_bytes, sizeof(vars->m_bytes)); |
| 2065 | SHA256_Update(&vars->hash_ctx, vars->r_bytes, sizeof(vars->r_bytes)); |
| 2066 | SHA256_Update(&vars->hash_ctx, out_ciphertext, POLY_BYTES); |
| 2067 | SHA256_Final(out_shared_key, &vars->hash_ctx); |
| 2068 | |
| 2069 | OPENSSL_free(malloc_ptr); |
| 2070 | return 1; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2071 | } |
| 2072 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2073 | int HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES], |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2074 | const struct HRSS_private_key *in_priv, |
| 2075 | const uint8_t *ciphertext, size_t ciphertext_len) { |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2076 | const struct private_key *priv = |
| 2077 | private_key_from_external((struct HRSS_private_key *)in_priv); |
| 2078 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2079 | struct vars { |
| 2080 | struct POLY_MUL_SCRATCH scratch; |
| 2081 | uint8_t masked_key[SHA256_CBLOCK]; |
| 2082 | SHA256_CTX hash_ctx; |
| 2083 | struct poly c; |
| 2084 | struct poly f, cf; |
| 2085 | struct poly3 cf3, m3; |
| 2086 | struct poly m, m_lifted; |
| 2087 | struct poly r; |
| 2088 | struct poly3 r3; |
| 2089 | uint8_t expected_ciphertext[HRSS_CIPHERTEXT_BYTES]; |
| 2090 | uint8_t m_bytes[HRSS_POLY3_BYTES]; |
| 2091 | uint8_t r_bytes[HRSS_POLY3_BYTES]; |
| 2092 | uint8_t shared_key[32]; |
| 2093 | }; |
| 2094 | |
| 2095 | void *malloc_ptr; |
| 2096 | struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars)); |
| 2097 | if (!vars) { |
| 2098 | // If the caller ignores the return value the output will still be safe. |
| 2099 | // The private key output is randomised in case it's used to encrypt and |
| 2100 | // transmit something. |
| 2101 | RAND_bytes(out_shared_key, HRSS_KEY_BYTES); |
| 2102 | return 0; |
| 2103 | } |
| 2104 | |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 2105 | #if !defined(NDEBUG) |
| 2106 | OPENSSL_memset(vars, 0xff, sizeof(struct vars)); |
| 2107 | #endif |
| 2108 | |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2109 | // This is HMAC, expanded inline rather than using the |HMAC| function so that |
| 2110 | // we can avoid dealing with possible allocation failures and so keep this |
| 2111 | // function infallible. |
David Benjamin | b7d6320 | 2022-07-26 13:25:02 -0700 | [diff] [blame] | 2112 | static_assert(sizeof(priv->hmac_key) <= sizeof(vars->masked_key), |
| 2113 | "HRSS HMAC key larger than SHA-256 block size"); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2114 | for (size_t i = 0; i < sizeof(priv->hmac_key); i++) { |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2115 | vars->masked_key[i] = priv->hmac_key[i] ^ 0x36; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2116 | } |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2117 | OPENSSL_memset(vars->masked_key + sizeof(priv->hmac_key), 0x36, |
| 2118 | sizeof(vars->masked_key) - sizeof(priv->hmac_key)); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2119 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2120 | SHA256_Init(&vars->hash_ctx); |
| 2121 | SHA256_Update(&vars->hash_ctx, vars->masked_key, sizeof(vars->masked_key)); |
| 2122 | SHA256_Update(&vars->hash_ctx, ciphertext, ciphertext_len); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2123 | uint8_t inner_digest[SHA256_DIGEST_LENGTH]; |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2124 | SHA256_Final(inner_digest, &vars->hash_ctx); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2125 | |
| 2126 | for (size_t i = 0; i < sizeof(priv->hmac_key); i++) { |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2127 | vars->masked_key[i] ^= (0x5c ^ 0x36); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2128 | } |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2129 | OPENSSL_memset(vars->masked_key + sizeof(priv->hmac_key), 0x5c, |
| 2130 | sizeof(vars->masked_key) - sizeof(priv->hmac_key)); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2131 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2132 | SHA256_Init(&vars->hash_ctx); |
| 2133 | SHA256_Update(&vars->hash_ctx, vars->masked_key, sizeof(vars->masked_key)); |
| 2134 | SHA256_Update(&vars->hash_ctx, inner_digest, sizeof(inner_digest)); |
David Benjamin | b7d6320 | 2022-07-26 13:25:02 -0700 | [diff] [blame] | 2135 | static_assert(HRSS_KEY_BYTES == SHA256_DIGEST_LENGTH, |
| 2136 | "HRSS shared key length incorrect"); |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2137 | SHA256_Final(out_shared_key, &vars->hash_ctx); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2138 | |
| 2139 | // If the ciphertext is publicly invalid then a random shared key is still |
| 2140 | // returned to simply the logic of the caller, but this path is not constant |
| 2141 | // time. |
Adam Langley | f8068ce | 2018-12-17 11:16:23 -0800 | [diff] [blame] | 2142 | if (ciphertext_len != HRSS_CIPHERTEXT_BYTES || |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2143 | !poly_unmarshal(&vars->c, ciphertext)) { |
| 2144 | goto out; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2145 | } |
| 2146 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2147 | poly_from_poly3(&vars->f, &priv->f); |
| 2148 | poly_mul(&vars->scratch, &vars->cf, &vars->c, &vars->f); |
| 2149 | poly3_from_poly(&vars->cf3, &vars->cf); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2150 | // Note that cf3 is not reduced mod Φ(N). That reduction is deferred. |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2151 | HRSS_poly3_mul(&vars->m3, &vars->cf3, &priv->f_inverse); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2152 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2153 | poly_from_poly3(&vars->m, &vars->m3); |
| 2154 | poly_lift(&vars->m_lifted, &vars->m); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2155 | |
| 2156 | for (unsigned i = 0; i < N; i++) { |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2157 | vars->r.v[i] = vars->c.v[i] - vars->m_lifted.v[i]; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2158 | } |
Adam Langley | f7e1a94 | 2022-04-01 11:46:32 -0700 | [diff] [blame] | 2159 | poly_normalize(&vars->r); |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2160 | poly_mul(&vars->scratch, &vars->r, &vars->r, &priv->ph_inverse); |
| 2161 | poly_mod_phiN(&vars->r); |
| 2162 | poly_clamp(&vars->r); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2163 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2164 | crypto_word_t ok = poly3_from_poly_checked(&vars->r3, &vars->r); |
Adam Langley | 9700b44 | 2018-12-17 11:17:26 -0800 | [diff] [blame] | 2165 | |
| 2166 | // [NTRUCOMP] section 5.1 includes ReEnc2 and a proof that it's valid. Rather |
| 2167 | // than do an expensive |poly_mul|, it rebuilds |c'| from |c - lift(m)| |
| 2168 | // (called |b|) with: |
| 2169 | // t = (−b(1)/N) mod Q |
| 2170 | // c' = b + tΦ(N) + lift(m) mod Q |
| 2171 | // |
| 2172 | // When polynomials are transmitted, the final coefficient is omitted and |
| 2173 | // |poly_unmarshal| sets it such that f(1) == 0. Thus c(1) == 0. Also, |
| 2174 | // |poly_lift| multiplies the result by (x-1) and therefore evaluating a |
| 2175 | // lifted polynomial at 1 is also zero. Thus lift(m)(1) == 0 and so |
| 2176 | // (c - lift(m))(1) == 0. |
| 2177 | // |
| 2178 | // Although we defer the reduction above, |b| is conceptually reduced mod |
| 2179 | // Φ(N). In order to do that reduction one subtracts |c[N-1]| from every |
| 2180 | // coefficient. Therefore b(1) = -c[N-1]×N. The value of |t|, above, then is |
| 2181 | // just recovering |c[N-1]|, and adding tΦ(N) is simply undoing the reduction. |
| 2182 | // Therefore b + tΦ(N) + lift(m) = c by construction and we don't need to |
| 2183 | // recover |c| at all so long as we do the checks in |
| 2184 | // |poly3_from_poly_checked|. |
| 2185 | // |
| 2186 | // The |poly_marshal| here then is just confirming that |poly_unmarshal| is |
| 2187 | // strict and could be omitted. |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2188 | |
David Benjamin | b7d6320 | 2022-07-26 13:25:02 -0700 | [diff] [blame] | 2189 | static_assert(HRSS_CIPHERTEXT_BYTES == POLY_BYTES, |
| 2190 | "ciphertext is the wrong size"); |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2191 | assert(ciphertext_len == sizeof(vars->expected_ciphertext)); |
| 2192 | poly_marshal(vars->expected_ciphertext, &vars->c); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2193 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2194 | poly_marshal_mod3(vars->m_bytes, &vars->m); |
| 2195 | poly_marshal_mod3(vars->r_bytes, &vars->r); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2196 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2197 | ok &= constant_time_is_zero_w( |
| 2198 | CRYPTO_memcmp(ciphertext, vars->expected_ciphertext, |
| 2199 | sizeof(vars->expected_ciphertext))); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2200 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2201 | SHA256_Init(&vars->hash_ctx); |
| 2202 | SHA256_Update(&vars->hash_ctx, kSharedKey, sizeof(kSharedKey)); |
| 2203 | SHA256_Update(&vars->hash_ctx, vars->m_bytes, sizeof(vars->m_bytes)); |
| 2204 | SHA256_Update(&vars->hash_ctx, vars->r_bytes, sizeof(vars->r_bytes)); |
| 2205 | SHA256_Update(&vars->hash_ctx, vars->expected_ciphertext, |
| 2206 | sizeof(vars->expected_ciphertext)); |
| 2207 | SHA256_Final(vars->shared_key, &vars->hash_ctx); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2208 | |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2209 | for (unsigned i = 0; i < sizeof(vars->shared_key); i++) { |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2210 | out_shared_key[i] = |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2211 | constant_time_select_8(ok, vars->shared_key[i], out_shared_key[i]); |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2212 | } |
Adam Langley | 7153013 | 2021-06-30 16:33:47 -0700 | [diff] [blame] | 2213 | |
| 2214 | out: |
| 2215 | OPENSSL_free(malloc_ptr); |
| 2216 | return 1; |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2217 | } |
| 2218 | |
| 2219 | void HRSS_marshal_public_key(uint8_t out[HRSS_PUBLIC_KEY_BYTES], |
| 2220 | const struct HRSS_public_key *in_pub) { |
| 2221 | const struct public_key *pub = |
| 2222 | public_key_from_external((struct HRSS_public_key *)in_pub); |
| 2223 | poly_marshal(out, &pub->ph); |
| 2224 | } |
| 2225 | |
| 2226 | int HRSS_parse_public_key(struct HRSS_public_key *out, |
| 2227 | const uint8_t in[HRSS_PUBLIC_KEY_BYTES]) { |
| 2228 | struct public_key *pub = public_key_from_external(out); |
Adam Langley | f8068ce | 2018-12-17 11:16:23 -0800 | [diff] [blame] | 2229 | if (!poly_unmarshal(&pub->ph, in)) { |
| 2230 | return 0; |
| 2231 | } |
Adam Langley | 7b93593 | 2018-11-12 13:53:42 -0800 | [diff] [blame] | 2232 | OPENSSL_memset(&pub->ph.v[N], 0, 3 * sizeof(uint16_t)); |
| 2233 | return 1; |
| 2234 | } |