blob: 572e9817c923a6738aec3c28f1cc424cf224df23 [file] [log] [blame]
Adam Langley7b935932018-11-12 13:53:42 -08001/* Copyright (c) 2018, Google Inc.
2 *
3 * Permission to use, copy, modify, and/or distribute this software for any
4 * purpose with or without fee is hereby granted, provided that the above
5 * copyright notice and this permission notice appear in all copies.
6 *
7 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14
15#include <openssl/hrss.h>
16
17#include <assert.h>
18#include <stdio.h>
19#include <stdlib.h>
20
21#include <openssl/bn.h>
Adam Langley7b935932018-11-12 13:53:42 -080022#include <openssl/hmac.h>
23#include <openssl/mem.h>
Adam Langley71530132021-06-30 16:33:47 -070024#include <openssl/rand.h>
Adam Langley7b935932018-11-12 13:53:42 -080025#include <openssl/sha.h>
26
Adam Langley7b935932018-11-12 13:53:42 -080027#if defined(_MSC_VER)
28#define RESTRICT
29#else
30#define RESTRICT restrict
31#endif
32
33#include "../internal.h"
34#include "internal.h"
35
David Benjamin6887d5e2019-12-17 13:56:30 -050036#if defined(OPENSSL_SSE2)
37#include <emmintrin.h>
38#endif
39
David Benjamin846a2272022-01-06 11:12:06 -050040#if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && defined(__ARM_NEON)
David Benjamin6887d5e2019-12-17 13:56:30 -050041#include <arm_neon.h>
42#endif
43
Adam Langley7b935932018-11-12 13:53:42 -080044// This is an implementation of [HRSS], but with a KEM transformation based on
45// [SXY]. The primary references are:
46
47// HRSS: https://eprint.iacr.org/2017/667.pdf
48// HRSSNIST:
49// https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/NTRU_HRSS_KEM.zip
50// SXY: https://eprint.iacr.org/2017/1005.pdf
51// NTRUTN14:
52// https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf
Adam Langleycbae9652019-12-06 14:58:52 -080053// NTRUCOMP: https://eprint.iacr.org/2018/1174
54// SAFEGCD: https://gcd.cr.yp.to/papers.html#safegcd
Adam Langley7b935932018-11-12 13:53:42 -080055
56
57// Vector operations.
58//
59// A couple of functions in this file can use vector operations to meaningful
60// effect. If we're building for a target that has a supported vector unit,
61// |HRSS_HAVE_VECTOR_UNIT| will be defined and |vec_t| will be typedefed to a
62// 128-bit vector. The following functions abstract over the differences between
63// NEON and SSE2 for implementing some vector operations.
64
David Benjamin6887d5e2019-12-17 13:56:30 -050065// TODO: MSVC can likely also be made to work with vector operations, but ^ must
66// be replaced with _mm_xor_si128, etc.
67#if defined(OPENSSL_SSE2) && (defined(__clang__) || !defined(_MSC_VER))
Adam Langley7b935932018-11-12 13:53:42 -080068
69#define HRSS_HAVE_VECTOR_UNIT
70typedef __m128i vec_t;
71
72// vec_capable returns one iff the current platform supports SSE2.
David Benjamin6887d5e2019-12-17 13:56:30 -050073static int vec_capable(void) { return 1; }
Adam Langley7b935932018-11-12 13:53:42 -080074
75// vec_add performs a pair-wise addition of four uint16s from |a| and |b|.
76static inline vec_t vec_add(vec_t a, vec_t b) { return _mm_add_epi16(a, b); }
77
78// vec_sub performs a pair-wise subtraction of four uint16s from |a| and |b|.
79static inline vec_t vec_sub(vec_t a, vec_t b) { return _mm_sub_epi16(a, b); }
80
81// vec_mul multiplies each uint16_t in |a| by |b| and returns the resulting
82// vector.
83static inline vec_t vec_mul(vec_t a, uint16_t b) {
84 return _mm_mullo_epi16(a, _mm_set1_epi16(b));
85}
86
87// vec_fma multiplies each uint16_t in |b| by |c|, adds the result to |a|, and
88// returns the resulting vector.
89static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
90 return _mm_add_epi16(a, _mm_mullo_epi16(b, _mm_set1_epi16(c)));
91}
92
93// vec3_rshift_word right-shifts the 24 uint16_t's in |v| by one uint16.
94static inline void vec3_rshift_word(vec_t v[3]) {
95 // Intel's left and right shifting is backwards compared to the order in
96 // memory because they're based on little-endian order of words (and not just
97 // bytes). So the shifts in this function will be backwards from what one
98 // might expect.
99 const __m128i carry0 = _mm_srli_si128(v[0], 14);
100 v[0] = _mm_slli_si128(v[0], 2);
101
102 const __m128i carry1 = _mm_srli_si128(v[1], 14);
103 v[1] = _mm_slli_si128(v[1], 2);
104 v[1] |= carry0;
105
106 v[2] = _mm_slli_si128(v[2], 2);
107 v[2] |= carry1;
108}
109
110// vec4_rshift_word right-shifts the 32 uint16_t's in |v| by one uint16.
111static inline void vec4_rshift_word(vec_t v[4]) {
112 // Intel's left and right shifting is backwards compared to the order in
113 // memory because they're based on little-endian order of words (and not just
114 // bytes). So the shifts in this function will be backwards from what one
115 // might expect.
116 const __m128i carry0 = _mm_srli_si128(v[0], 14);
117 v[0] = _mm_slli_si128(v[0], 2);
118
119 const __m128i carry1 = _mm_srli_si128(v[1], 14);
120 v[1] = _mm_slli_si128(v[1], 2);
121 v[1] |= carry0;
122
123 const __m128i carry2 = _mm_srli_si128(v[2], 14);
124 v[2] = _mm_slli_si128(v[2], 2);
125 v[2] |= carry1;
126
127 v[3] = _mm_slli_si128(v[3], 2);
128 v[3] |= carry2;
129}
130
131// vec_merge_3_5 takes the final three uint16_t's from |left|, appends the first
132// five from |right|, and returns the resulting vector.
133static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
134 return _mm_srli_si128(left, 10) | _mm_slli_si128(right, 6);
135}
136
137// poly3_vec_lshift1 left-shifts the 768 bits in |a_s|, and in |a_a|, by one
138// bit.
139static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
140 vec_t carry_s = {0};
141 vec_t carry_a = {0};
142
143 for (int i = 0; i < 6; i++) {
144 vec_t next_carry_s = _mm_srli_epi64(a_s[i], 63);
145 a_s[i] = _mm_slli_epi64(a_s[i], 1);
146 a_s[i] |= _mm_slli_si128(next_carry_s, 8);
147 a_s[i] |= carry_s;
148 carry_s = _mm_srli_si128(next_carry_s, 8);
149
150 vec_t next_carry_a = _mm_srli_epi64(a_a[i], 63);
151 a_a[i] = _mm_slli_epi64(a_a[i], 1);
152 a_a[i] |= _mm_slli_si128(next_carry_a, 8);
153 a_a[i] |= carry_a;
154 carry_a = _mm_srli_si128(next_carry_a, 8);
155 }
156}
157
158// poly3_vec_rshift1 right-shifts the 768 bits in |a_s|, and in |a_a|, by one
159// bit.
160static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
161 vec_t carry_s = {0};
162 vec_t carry_a = {0};
163
164 for (int i = 5; i >= 0; i--) {
165 const vec_t next_carry_s = _mm_slli_epi64(a_s[i], 63);
166 a_s[i] = _mm_srli_epi64(a_s[i], 1);
167 a_s[i] |= _mm_srli_si128(next_carry_s, 8);
168 a_s[i] |= carry_s;
169 carry_s = _mm_slli_si128(next_carry_s, 8);
170
171 const vec_t next_carry_a = _mm_slli_epi64(a_a[i], 63);
172 a_a[i] = _mm_srli_epi64(a_a[i], 1);
173 a_a[i] |= _mm_srli_si128(next_carry_a, 8);
174 a_a[i] |= carry_a;
175 carry_a = _mm_slli_si128(next_carry_a, 8);
176 }
177}
178
179// vec_broadcast_bit duplicates the least-significant bit in |a| to all bits in
180// a vector and returns the result.
181static inline vec_t vec_broadcast_bit(vec_t a) {
182 return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63), 31),
183 0b01010101);
184}
185
Adam Langley7b935932018-11-12 13:53:42 -0800186// vec_get_word returns the |i|th uint16_t in |v|. (This is a macro because the
187// compiler requires that |i| be a compile-time constant.)
188#define vec_get_word(v, i) _mm_extract_epi16(v, i)
189
David Benjamin846a2272022-01-06 11:12:06 -0500190#elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && defined(__ARM_NEON)
Adam Langley7b935932018-11-12 13:53:42 -0800191
192#define HRSS_HAVE_VECTOR_UNIT
193typedef uint16x8_t vec_t;
194
195// These functions perform the same actions as the SSE2 function of the same
196// name, above.
197
Alessandro Ghedini9b0970f2018-12-14 16:43:36 +0000198static int vec_capable(void) { return CRYPTO_is_NEON_capable(); }
Adam Langley7b935932018-11-12 13:53:42 -0800199
200static inline vec_t vec_add(vec_t a, vec_t b) { return a + b; }
201
202static inline vec_t vec_sub(vec_t a, vec_t b) { return a - b; }
203
204static inline vec_t vec_mul(vec_t a, uint16_t b) { return vmulq_n_u16(a, b); }
205
206static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
207 return vmlaq_n_u16(a, b, c);
208}
209
210static inline void vec3_rshift_word(vec_t v[3]) {
211 const uint16x8_t kZero = {0};
212 v[2] = vextq_u16(v[1], v[2], 7);
213 v[1] = vextq_u16(v[0], v[1], 7);
214 v[0] = vextq_u16(kZero, v[0], 7);
215}
216
217static inline void vec4_rshift_word(vec_t v[4]) {
218 const uint16x8_t kZero = {0};
219 v[3] = vextq_u16(v[2], v[3], 7);
220 v[2] = vextq_u16(v[1], v[2], 7);
221 v[1] = vextq_u16(v[0], v[1], 7);
222 v[0] = vextq_u16(kZero, v[0], 7);
223}
224
225static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
226 return vextq_u16(left, right, 5);
227}
228
229static inline uint16_t vec_get_word(vec_t v, unsigned i) {
230 return v[i];
231}
232
233#if !defined(OPENSSL_AARCH64)
234
235static inline vec_t vec_broadcast_bit(vec_t a) {
236 a = (vec_t)vshrq_n_s16(((int16x8_t)a) << 15, 15);
237 return vdupq_lane_u16(vget_low_u16(a), 0);
238}
239
Adam Langley7b935932018-11-12 13:53:42 -0800240static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
241 vec_t carry_s = {0};
242 vec_t carry_a = {0};
243 const vec_t kZero = {0};
244
245 for (int i = 0; i < 6; i++) {
246 vec_t next_carry_s = a_s[i] >> 15;
247 a_s[i] <<= 1;
248 a_s[i] |= vextq_u16(kZero, next_carry_s, 7);
249 a_s[i] |= carry_s;
250 carry_s = vextq_u16(next_carry_s, kZero, 7);
251
252 vec_t next_carry_a = a_a[i] >> 15;
253 a_a[i] <<= 1;
254 a_a[i] |= vextq_u16(kZero, next_carry_a, 7);
255 a_a[i] |= carry_a;
256 carry_a = vextq_u16(next_carry_a, kZero, 7);
257 }
258}
259
260static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
261 vec_t carry_s = {0};
262 vec_t carry_a = {0};
263 const vec_t kZero = {0};
264
265 for (int i = 5; i >= 0; i--) {
266 vec_t next_carry_s = a_s[i] << 15;
267 a_s[i] >>= 1;
268 a_s[i] |= vextq_u16(next_carry_s, kZero, 1);
269 a_s[i] |= carry_s;
270 carry_s = vextq_u16(kZero, next_carry_s, 1);
271
272 vec_t next_carry_a = a_a[i] << 15;
273 a_a[i] >>= 1;
274 a_a[i] |= vextq_u16(next_carry_a, kZero, 1);
275 a_a[i] |= carry_a;
276 carry_a = vextq_u16(kZero, next_carry_a, 1);
277 }
278}
279
280#endif // !OPENSSL_AARCH64
281
282#endif // (ARM || AARCH64) && NEON
283
284// Polynomials in this scheme have N terms.
285// #define N 701
286
287// Underlying data types and arithmetic operations.
288// ------------------------------------------------
289
290// Binary polynomials.
291
292// poly2 represents a degree-N polynomial over GF(2). The words are in little-
293// endian order, i.e. the coefficient of x^0 is the LSB of the first word. The
294// final word is only partially used since N is not a multiple of the word size.
295
296// Defined in internal.h:
297// struct poly2 {
298// crypto_word_t v[WORDS_PER_POLY];
299// };
300
301OPENSSL_UNUSED static void hexdump(const void *void_in, size_t len) {
302 const uint8_t *in = (const uint8_t *)void_in;
303 for (size_t i = 0; i < len; i++) {
304 printf("%02x", in[i]);
305 }
306 printf("\n");
307}
308
309static void poly2_zero(struct poly2 *p) {
310 OPENSSL_memset(&p->v[0], 0, sizeof(crypto_word_t) * WORDS_PER_POLY);
311}
312
Adam Langleycbae9652019-12-06 14:58:52 -0800313// word_reverse returns |in| with the bits in reverse order.
314static crypto_word_t word_reverse(crypto_word_t in) {
315#if defined(OPENSSL_64_BIT)
316 static const crypto_word_t kMasks[6] = {
317 UINT64_C(0x5555555555555555),
318 UINT64_C(0x3333333333333333),
319 UINT64_C(0x0f0f0f0f0f0f0f0f),
320 UINT64_C(0x00ff00ff00ff00ff),
321 UINT64_C(0x0000ffff0000ffff),
322 UINT64_C(0x00000000ffffffff),
323 };
324#else
325 static const crypto_word_t kMasks[5] = {
326 0x55555555,
327 0x33333333,
328 0x0f0f0f0f,
329 0x00ff00ff,
330 0x0000ffff,
331 };
332#endif
333
334 for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kMasks); i++) {
335 in = ((in >> (1 << i)) & kMasks[i]) | ((in & kMasks[i]) << (1 << i));
336 }
337
338 return in;
339}
340
341// lsb_to_all replicates the least-significant bit of |v| to all bits of the
342// word. This is used in bit-slicing operations to make a vector from a fixed
343// value.
344static crypto_word_t lsb_to_all(crypto_word_t v) { return 0u - (v & 1); }
345
346// poly2_mod_phiN reduces |p| by Φ(N).
347static void poly2_mod_phiN(struct poly2 *p) {
348 // m is the term at x^700, replicated to every bit.
349 const crypto_word_t m =
350 lsb_to_all(p->v[WORDS_PER_POLY - 1] >> (BITS_IN_LAST_WORD - 1));
Adam Langley7b935932018-11-12 13:53:42 -0800351 for (size_t i = 0; i < WORDS_PER_POLY; i++) {
Adam Langleycbae9652019-12-06 14:58:52 -0800352 p->v[i] ^= m;
Adam Langley7b935932018-11-12 13:53:42 -0800353 }
Adam Langleycbae9652019-12-06 14:58:52 -0800354 p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << (BITS_IN_LAST_WORD - 1)) - 1;
Adam Langley7b935932018-11-12 13:53:42 -0800355}
356
Adam Langleycbae9652019-12-06 14:58:52 -0800357// poly2_reverse_700 reverses the order of the first 700 bits of |in| and writes
358// the result to |out|.
359static void poly2_reverse_700(struct poly2 *out, const struct poly2 *in) {
360 struct poly2 t;
361 for (size_t i = 0; i < WORDS_PER_POLY; i++) {
362 t.v[i] = word_reverse(in->v[i]);
Adam Langley7b935932018-11-12 13:53:42 -0800363 }
364
Adam Langleycbae9652019-12-06 14:58:52 -0800365 static const size_t shift = BITS_PER_WORD - ((N-1) % BITS_PER_WORD);
366 for (size_t i = 0; i < WORDS_PER_POLY-1; i++) {
367 out->v[i] = t.v[WORDS_PER_POLY-1-i] >> shift;
368 out->v[i] |= t.v[WORDS_PER_POLY-2-i] << (BITS_PER_WORD - shift);
Adam Langley7b935932018-11-12 13:53:42 -0800369 }
Adam Langleycbae9652019-12-06 14:58:52 -0800370 out->v[WORDS_PER_POLY-1] = t.v[0] >> shift;
Adam Langley7b935932018-11-12 13:53:42 -0800371}
372
373// poly2_cswap exchanges the values of |a| and |b| if |swap| is all ones.
374static void poly2_cswap(struct poly2 *a, struct poly2 *b, crypto_word_t swap) {
375 for (size_t i = 0; i < WORDS_PER_POLY; i++) {
376 const crypto_word_t sum = swap & (a->v[i] ^ b->v[i]);
377 a->v[i] ^= sum;
378 b->v[i] ^= sum;
379 }
380}
381
382// poly2_fmadd sets |out| to |out| + |in| * m, where m is either
383// |CONSTTIME_TRUE_W| or |CONSTTIME_FALSE_W|.
384static void poly2_fmadd(struct poly2 *out, const struct poly2 *in,
385 crypto_word_t m) {
386 for (size_t i = 0; i < WORDS_PER_POLY; i++) {
387 out->v[i] ^= in->v[i] & m;
388 }
389}
390
391// poly2_lshift1 left-shifts |p| by one bit.
392static void poly2_lshift1(struct poly2 *p) {
393 crypto_word_t carry = 0;
394 for (size_t i = 0; i < WORDS_PER_POLY; i++) {
395 const crypto_word_t next_carry = p->v[i] >> (BITS_PER_WORD - 1);
396 p->v[i] <<= 1;
397 p->v[i] |= carry;
398 carry = next_carry;
399 }
400}
401
402// poly2_rshift1 right-shifts |p| by one bit.
403static void poly2_rshift1(struct poly2 *p) {
404 crypto_word_t carry = 0;
405 for (size_t i = WORDS_PER_POLY - 1; i < WORDS_PER_POLY; i--) {
406 const crypto_word_t next_carry = p->v[i] & 1;
407 p->v[i] >>= 1;
408 p->v[i] |= carry << (BITS_PER_WORD - 1);
409 carry = next_carry;
410 }
411}
412
413// poly2_clear_top_bits clears the bits in the final word that are only for
414// alignment.
415static void poly2_clear_top_bits(struct poly2 *p) {
416 p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
417}
418
419// poly2_top_bits_are_clear returns one iff the extra bits in the final words of
420// |p| are zero.
421static int poly2_top_bits_are_clear(const struct poly2 *p) {
422 return (p->v[WORDS_PER_POLY - 1] &
423 ~((UINT64_C(1) << BITS_IN_LAST_WORD) - 1)) == 0;
424}
425
426// Ternary polynomials.
427
428// poly3 represents a degree-N polynomial over GF(3). Each coefficient is
429// bitsliced across the |s| and |a| arrays, like this:
430//
431// s | a | value
432// -----------------
433// 0 | 0 | 0
434// 0 | 1 | 1
Adam Langleyeadef472019-01-16 17:20:24 -0800435// 1 | 1 | -1 (aka 2)
436// 1 | 0 | <invalid>
Adam Langley7b935932018-11-12 13:53:42 -0800437//
Adam Langleyeadef472019-01-16 17:20:24 -0800438// ('s' is for sign, and 'a' is the absolute value.)
Adam Langley7b935932018-11-12 13:53:42 -0800439//
440// Once bitsliced as such, the following circuits can be used to implement
441// addition and multiplication mod 3:
442//
443// (s3, a3) = (s1, a1) × (s2, a2)
Adam Langleyeadef472019-01-16 17:20:24 -0800444// a3 = a1 ∧ a2
445// s3 = (s1 ⊕ s2) ∧ a3
Adam Langley7b935932018-11-12 13:53:42 -0800446//
447// (s3, a3) = (s1, a1) + (s2, a2)
Adam Langleyeadef472019-01-16 17:20:24 -0800448// t = s1 ⊕ a2
449// s3 = t ∧ (s2 ⊕ a1)
450// a3 = (a1 ⊕ a2) ∨ (t ⊕ s2)
Adam Langley7b935932018-11-12 13:53:42 -0800451//
Adam Langleyeadef472019-01-16 17:20:24 -0800452// (s3, a3) = (s1, a1) - (s2, a2)
453// t = a1 ⊕ a2
454// s3 = (s1 ⊕ a2) ∧ (t ⊕ s2)
455// a3 = t ∨ (s1 ⊕ s2)
456//
457// Negating a value just involves XORing s by a.
458//
Adam Langley7b935932018-11-12 13:53:42 -0800459// struct poly3 {
460// struct poly2 s, a;
461// };
462
463OPENSSL_UNUSED static void poly3_print(const struct poly3 *in) {
464 struct poly3 p;
465 OPENSSL_memcpy(&p, in, sizeof(p));
466 p.s.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1;
467 p.a.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1;
468
469 printf("{[");
470 for (unsigned i = 0; i < WORDS_PER_POLY; i++) {
471 if (i) {
472 printf(" ");
473 }
474 printf(BN_HEX_FMT2, p.s.v[i]);
475 }
476 printf("] [");
477 for (unsigned i = 0; i < WORDS_PER_POLY; i++) {
478 if (i) {
479 printf(" ");
480 }
481 printf(BN_HEX_FMT2, p.a.v[i]);
482 }
483 printf("]}\n");
484}
485
486static void poly3_zero(struct poly3 *p) {
487 poly2_zero(&p->s);
488 poly2_zero(&p->a);
489}
490
Adam Langleycbae9652019-12-06 14:58:52 -0800491// poly3_reverse_700 reverses the order of the first 700 terms of |in| and
492// writes them to |out|.
493static void poly3_reverse_700(struct poly3 *out, const struct poly3 *in) {
494 poly2_reverse_700(&out->a, &in->a);
495 poly2_reverse_700(&out->s, &in->s);
496}
497
498// poly3_word_mul sets (|out_s|, |out_a|) to (|s1|, |a1|) × (|s2|, |a2|).
Adam Langleyeadef472019-01-16 17:20:24 -0800499static void poly3_word_mul(crypto_word_t *out_s, crypto_word_t *out_a,
500 const crypto_word_t s1, const crypto_word_t a1,
501 const crypto_word_t s2, const crypto_word_t a2) {
502 *out_a = a1 & a2;
503 *out_s = (s1 ^ s2) & *out_a;
504}
505
506// poly3_word_add sets (|out_s|, |out_a|) to (|s1|, |a1|) + (|s2|, |a2|).
507static void poly3_word_add(crypto_word_t *out_s, crypto_word_t *out_a,
508 const crypto_word_t s1, const crypto_word_t a1,
509 const crypto_word_t s2, const crypto_word_t a2) {
510 const crypto_word_t t = s1 ^ a2;
511 *out_s = t & (s2 ^ a1);
512 *out_a = (a1 ^ a2) | (t ^ s2);
513}
514
515// poly3_word_sub sets (|out_s|, |out_a|) to (|s1|, |a1|) - (|s2|, |a2|).
516static void poly3_word_sub(crypto_word_t *out_s, crypto_word_t *out_a,
517 const crypto_word_t s1, const crypto_word_t a1,
518 const crypto_word_t s2, const crypto_word_t a2) {
519 const crypto_word_t t = a1 ^ a2;
520 *out_s = (s1 ^ a2) & (t ^ s2);
521 *out_a = t | (s1 ^ s2);
522}
523
Adam Langleyeadef472019-01-16 17:20:24 -0800524// poly3_mul_const sets |p| to |p|×m, where m = (ms, ma).
Adam Langley7b935932018-11-12 13:53:42 -0800525static void poly3_mul_const(struct poly3 *p, crypto_word_t ms,
526 crypto_word_t ma) {
527 ms = lsb_to_all(ms);
528 ma = lsb_to_all(ma);
529
530 for (size_t i = 0; i < WORDS_PER_POLY; i++) {
Adam Langleyeadef472019-01-16 17:20:24 -0800531 poly3_word_mul(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], ms, ma);
Adam Langley7b935932018-11-12 13:53:42 -0800532 }
533}
534
Adam Langleyeadef472019-01-16 17:20:24 -0800535// poly3_fmadd sets |out| to |out| - |in|×m, where m is (ms, ma).
536static void poly3_fmsub(struct poly3 *RESTRICT out,
Adam Langley7b935932018-11-12 13:53:42 -0800537 const struct poly3 *RESTRICT in, crypto_word_t ms,
538 crypto_word_t ma) {
Adam Langleyeadef472019-01-16 17:20:24 -0800539 crypto_word_t product_s, product_a;
Adam Langley7b935932018-11-12 13:53:42 -0800540 for (size_t i = 0; i < WORDS_PER_POLY; i++) {
Adam Langleyeadef472019-01-16 17:20:24 -0800541 poly3_word_mul(&product_s, &product_a, in->s.v[i], in->a.v[i], ms, ma);
542 poly3_word_sub(&out->s.v[i], &out->a.v[i], out->s.v[i], out->a.v[i],
543 product_s, product_a);
Adam Langley7b935932018-11-12 13:53:42 -0800544 }
545}
546
547// final_bit_to_all replicates the bit in the final position of the last word to
548// all the bits in the word.
549static crypto_word_t final_bit_to_all(crypto_word_t v) {
550 return lsb_to_all(v >> (BITS_IN_LAST_WORD - 1));
551}
552
553// poly3_top_bits_are_clear returns one iff the extra bits in the final words of
554// |p| are zero.
555OPENSSL_UNUSED static int poly3_top_bits_are_clear(const struct poly3 *p) {
556 return poly2_top_bits_are_clear(&p->s) && poly2_top_bits_are_clear(&p->a);
557}
558
559// poly3_mod_phiN reduces |p| by Φ(N).
560static void poly3_mod_phiN(struct poly3 *p) {
561 // In order to reduce by Φ(N) we subtract by the value of the greatest
Adam Langleyeadef472019-01-16 17:20:24 -0800562 // coefficient.
563 const crypto_word_t factor_s = final_bit_to_all(p->s.v[WORDS_PER_POLY - 1]);
564 const crypto_word_t factor_a = final_bit_to_all(p->a.v[WORDS_PER_POLY - 1]);
Adam Langley7b935932018-11-12 13:53:42 -0800565
566 for (size_t i = 0; i < WORDS_PER_POLY; i++) {
Adam Langleyeadef472019-01-16 17:20:24 -0800567 poly3_word_sub(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], factor_s,
568 factor_a);
Adam Langley7b935932018-11-12 13:53:42 -0800569 }
570
571 poly2_clear_top_bits(&p->s);
572 poly2_clear_top_bits(&p->a);
573}
574
575static void poly3_cswap(struct poly3 *a, struct poly3 *b, crypto_word_t swap) {
576 poly2_cswap(&a->s, &b->s, swap);
577 poly2_cswap(&a->a, &b->a, swap);
578}
579
580static void poly3_lshift1(struct poly3 *p) {
581 poly2_lshift1(&p->s);
582 poly2_lshift1(&p->a);
583}
584
585static void poly3_rshift1(struct poly3 *p) {
586 poly2_rshift1(&p->s);
587 poly2_rshift1(&p->a);
588}
589
590// poly3_span represents a pointer into a poly3.
591struct poly3_span {
592 crypto_word_t *s;
593 crypto_word_t *a;
594};
595
Adam Langley7b935932018-11-12 13:53:42 -0800596// poly3_span_add adds |n| words of values from |a| and |b| and writes the
597// result to |out|.
598static void poly3_span_add(const struct poly3_span *out,
599 const struct poly3_span *a,
600 const struct poly3_span *b, size_t n) {
601 for (size_t i = 0; i < n; i++) {
602 poly3_word_add(&out->s[i], &out->a[i], a->s[i], a->a[i], b->s[i], b->a[i]);
603 }
604}
605
606// poly3_span_sub subtracts |n| words of |b| from |n| words of |a|.
607static void poly3_span_sub(const struct poly3_span *a,
608 const struct poly3_span *b, size_t n) {
609 for (size_t i = 0; i < n; i++) {
Adam Langleyeadef472019-01-16 17:20:24 -0800610 poly3_word_sub(&a->s[i], &a->a[i], a->s[i], a->a[i], b->s[i], b->a[i]);
Adam Langley7b935932018-11-12 13:53:42 -0800611 }
612}
613
614// poly3_mul_aux is a recursive function that multiplies |n| words from |a| and
615// |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements of
616// |scratch| and the function recurses, except if |n| == 1, when |scratch| isn't
617// used and the recursion stops. For |n| in {11, 22}, the transitive total
618// amount of |scratch| needed happens to be 2n+2.
619static void poly3_mul_aux(const struct poly3_span *out,
620 const struct poly3_span *scratch,
621 const struct poly3_span *a,
622 const struct poly3_span *b, size_t n) {
623 if (n == 1) {
624 crypto_word_t r_s_low = 0, r_s_high = 0, r_a_low = 0, r_a_high = 0;
625 crypto_word_t b_s = b->s[0], b_a = b->a[0];
626 const crypto_word_t a_s = a->s[0], a_a = a->a[0];
627
628 for (size_t i = 0; i < BITS_PER_WORD; i++) {
629 // Multiply (s, a) by the next value from (b_s, b_a).
Adam Langleyeadef472019-01-16 17:20:24 -0800630 crypto_word_t m_s, m_a;
631 poly3_word_mul(&m_s, &m_a, a_s, a_a, lsb_to_all(b_s), lsb_to_all(b_a));
Adam Langley7b935932018-11-12 13:53:42 -0800632 b_s >>= 1;
633 b_a >>= 1;
634
Adam Langley7b935932018-11-12 13:53:42 -0800635 if (i == 0) {
636 // Special case otherwise the code tries to shift by BITS_PER_WORD
637 // below, which is undefined.
638 r_s_low = m_s;
639 r_a_low = m_a;
640 continue;
641 }
642
643 // Shift the multiplication result to the correct position.
644 const crypto_word_t m_s_low = m_s << i;
645 const crypto_word_t m_s_high = m_s >> (BITS_PER_WORD - i);
646 const crypto_word_t m_a_low = m_a << i;
647 const crypto_word_t m_a_high = m_a >> (BITS_PER_WORD - i);
648
649 // Add into the result.
650 poly3_word_add(&r_s_low, &r_a_low, r_s_low, r_a_low, m_s_low, m_a_low);
651 poly3_word_add(&r_s_high, &r_a_high, r_s_high, r_a_high, m_s_high,
652 m_a_high);
653 }
654
655 out->s[0] = r_s_low;
656 out->s[1] = r_s_high;
657 out->a[0] = r_a_low;
658 out->a[1] = r_a_high;
659 return;
660 }
661
662 // Karatsuba multiplication.
663 // https://en.wikipedia.org/wiki/Karatsuba_algorithm
664
665 // When |n| is odd, the two "halves" will have different lengths. The first
666 // is always the smaller.
667 const size_t low_len = n / 2;
668 const size_t high_len = n - low_len;
669 const struct poly3_span a_high = {&a->s[low_len], &a->a[low_len]};
670 const struct poly3_span b_high = {&b->s[low_len], &b->a[low_len]};
671
672 // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second
673 // half.
674 const struct poly3_span a_cross_sum = *out;
675 const struct poly3_span b_cross_sum = {&out->s[high_len], &out->a[high_len]};
676 poly3_span_add(&a_cross_sum, a, &a_high, low_len);
677 poly3_span_add(&b_cross_sum, b, &b_high, low_len);
678 if (high_len != low_len) {
679 a_cross_sum.s[low_len] = a_high.s[low_len];
680 a_cross_sum.a[low_len] = a_high.a[low_len];
681 b_cross_sum.s[low_len] = b_high.s[low_len];
682 b_cross_sum.a[low_len] = b_high.a[low_len];
683 }
684
685 const struct poly3_span child_scratch = {&scratch->s[2 * high_len],
686 &scratch->a[2 * high_len]};
687 const struct poly3_span out_mid = {&out->s[low_len], &out->a[low_len]};
688 const struct poly3_span out_high = {&out->s[2 * low_len],
689 &out->a[2 * low_len]};
690
691 // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
692 poly3_mul_aux(scratch, &child_scratch, &a_cross_sum, &b_cross_sum, high_len);
693 // Calculate a_1 × b_1.
694 poly3_mul_aux(&out_high, &child_scratch, &a_high, &b_high, high_len);
695 // Calculate a_0 × b_0.
696 poly3_mul_aux(out, &child_scratch, a, b, low_len);
697
698 // Subtract those last two products from the first.
699 poly3_span_sub(scratch, out, low_len * 2);
700 poly3_span_sub(scratch, &out_high, high_len * 2);
701
702 // Add the middle product into the output.
703 poly3_span_add(&out_mid, &out_mid, scratch, high_len * 2);
704}
705
706// HRSS_poly3_mul sets |*out| to |x|×|y| mod Φ(N).
707void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x,
708 const struct poly3 *y) {
709 crypto_word_t prod_s[WORDS_PER_POLY * 2];
710 crypto_word_t prod_a[WORDS_PER_POLY * 2];
711 crypto_word_t scratch_s[WORDS_PER_POLY * 2 + 2];
712 crypto_word_t scratch_a[WORDS_PER_POLY * 2 + 2];
713 const struct poly3_span prod_span = {prod_s, prod_a};
714 const struct poly3_span scratch_span = {scratch_s, scratch_a};
715 const struct poly3_span x_span = {(crypto_word_t *)x->s.v,
716 (crypto_word_t *)x->a.v};
717 const struct poly3_span y_span = {(crypto_word_t *)y->s.v,
718 (crypto_word_t *)y->a.v};
719
720 poly3_mul_aux(&prod_span, &scratch_span, &x_span, &y_span, WORDS_PER_POLY);
721
722 // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the
723 // upper-half to the lower-half. However, N is 701, which isn't a multiple of
724 // BITS_PER_WORD, so the upper-half vectors all have to be shifted before
725 // being added to the lower-half.
726 for (size_t i = 0; i < WORDS_PER_POLY; i++) {
727 crypto_word_t v_s = prod_s[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
728 v_s |= prod_s[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
729 crypto_word_t v_a = prod_a[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
730 v_a |= prod_a[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
731
732 poly3_word_add(&out->s.v[i], &out->a.v[i], prod_s[i], prod_a[i], v_s, v_a);
733 }
734
735 poly3_mod_phiN(out);
736}
737
738#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
739
740// poly3_vec_cswap swaps (|a_s|, |a_a|) and (|b_s|, |b_a|) if |swap| is
741// |0xff..ff|. Otherwise, |swap| must be zero.
742static inline void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
743 vec_t b_a[6], const vec_t swap) {
744 for (int i = 0; i < 6; i++) {
745 const vec_t sum_s = swap & (a_s[i] ^ b_s[i]);
746 a_s[i] ^= sum_s;
747 b_s[i] ^= sum_s;
748
749 const vec_t sum_a = swap & (a_a[i] ^ b_a[i]);
750 a_a[i] ^= sum_a;
751 b_a[i] ^= sum_a;
752 }
753}
754
Adam Langleyeadef472019-01-16 17:20:24 -0800755// poly3_vec_fmsub subtracts (|ms|, |ma|) × (|b_s|, |b_a|) from (|a_s|, |a_a|).
756static inline void poly3_vec_fmsub(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
Adam Langley7b935932018-11-12 13:53:42 -0800757 vec_t b_a[6], const vec_t ms,
758 const vec_t ma) {
759 for (int i = 0; i < 6; i++) {
Adam Langleyeadef472019-01-16 17:20:24 -0800760 // See the bitslice formula, above.
Adam Langley7b935932018-11-12 13:53:42 -0800761 const vec_t s = b_s[i];
762 const vec_t a = b_a[i];
Adam Langleyeadef472019-01-16 17:20:24 -0800763 const vec_t product_a = a & ma;
764 const vec_t product_s = (s ^ ms) & product_a;
Adam Langley7b935932018-11-12 13:53:42 -0800765
Adam Langleyeadef472019-01-16 17:20:24 -0800766 const vec_t out_s = a_s[i];
767 const vec_t out_a = a_a[i];
768 const vec_t t = out_a ^ product_a;
769 a_s[i] = (out_s ^ product_a) & (t ^ product_s);
770 a_a[i] = t | (out_s ^ product_s);
Adam Langley7b935932018-11-12 13:53:42 -0800771 }
772}
773
774// poly3_invert_vec sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod
775// Φ(N).
776static void poly3_invert_vec(struct poly3 *out, const struct poly3 *in) {
Adam Langleycbae9652019-12-06 14:58:52 -0800777 // This algorithm is taken from section 7.1 of [SAFEGCD].
Adam Langley7b935932018-11-12 13:53:42 -0800778 const vec_t kZero = {0};
779 const vec_t kOne = {1};
Adam Langley7b935932018-11-12 13:53:42 -0800780 static const uint8_t kBottomSixtyOne[sizeof(vec_t)] = {
781 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f};
782
Adam Langleycbae9652019-12-06 14:58:52 -0800783 vec_t v_s[6], v_a[6], r_s[6], r_a[6], f_s[6], f_a[6], g_s[6], g_a[6];
784 // v = 0
785 memset(&v_s, 0, sizeof(v_s));
786 memset(&v_a, 0, sizeof(v_a));
787 // r = 1
788 memset(&r_s, 0, sizeof(r_s));
789 memset(&r_a, 0, sizeof(r_a));
790 r_a[0] = kOne;
791 // f = all ones.
792 memset(f_s, 0, sizeof(f_s));
793 memset(f_a, 0xff, 5 * sizeof(vec_t));
794 memcpy(&f_a[5], kBottomSixtyOne, sizeof(kBottomSixtyOne));
795 // g is the reversal of |in|.
796 struct poly3 in_reversed;
797 poly3_reverse_700(&in_reversed, in);
798 g_s[5] = kZero;
799 memcpy(&g_s, &in_reversed.s.v, WORDS_PER_POLY * sizeof(crypto_word_t));
800 g_a[5] = kZero;
801 memcpy(&g_a, &in_reversed.a.v, WORDS_PER_POLY * sizeof(crypto_word_t));
Adam Langley7b935932018-11-12 13:53:42 -0800802
Adam Langleycbae9652019-12-06 14:58:52 -0800803 int delta = 1;
Adam Langley7b935932018-11-12 13:53:42 -0800804
Adam Langleycbae9652019-12-06 14:58:52 -0800805 for (size_t i = 0; i < (2*(N-1)) - 1; i++) {
806 poly3_vec_lshift1(v_s, v_a);
Adam Langley7b935932018-11-12 13:53:42 -0800807
Adam Langleycbae9652019-12-06 14:58:52 -0800808 const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1;
809 const crypto_word_t delta_is_non_negative = delta_sign_bit - 1;
810 const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta);
811 const vec_t g_has_constant_term = vec_broadcast_bit(g_a[0]);
812 const vec_t mask_w =
813 {delta_is_non_negative & delta_is_non_zero};
814 const vec_t mask = vec_broadcast_bit(mask_w) & g_has_constant_term;
Adam Langley7b935932018-11-12 13:53:42 -0800815
Adam Langleycbae9652019-12-06 14:58:52 -0800816 const vec_t c_a = vec_broadcast_bit(f_a[0] & g_a[0]);
817 const vec_t c_s = vec_broadcast_bit((f_s[0] ^ g_s[0]) & c_a);
Adam Langley7b935932018-11-12 13:53:42 -0800818
Adam Langleycbae9652019-12-06 14:58:52 -0800819 delta = constant_time_select_int(lsb_to_all(mask[0]), -delta, delta);
820 delta++;
Adam Langley7b935932018-11-12 13:53:42 -0800821
Adam Langleycbae9652019-12-06 14:58:52 -0800822 poly3_vec_cswap(f_s, f_a, g_s, g_a, mask);
823 poly3_vec_fmsub(g_s, g_a, f_s, f_a, c_s, c_a);
824 poly3_vec_rshift1(g_s, g_a);
Adam Langley7b935932018-11-12 13:53:42 -0800825
Adam Langleycbae9652019-12-06 14:58:52 -0800826 poly3_vec_cswap(v_s, v_a, r_s, r_a, mask);
827 poly3_vec_fmsub(r_s, r_a, v_s, v_a, c_s, c_a);
Adam Langley7b935932018-11-12 13:53:42 -0800828 }
829
Adam Langleycbae9652019-12-06 14:58:52 -0800830 assert(delta == 0);
831 memcpy(out->s.v, v_s, WORDS_PER_POLY * sizeof(crypto_word_t));
832 memcpy(out->a.v, v_a, WORDS_PER_POLY * sizeof(crypto_word_t));
833 poly3_mul_const(out, vec_get_word(f_s[0], 0), vec_get_word(f_a[0], 0));
834 poly3_reverse_700(out, out);
Adam Langley7b935932018-11-12 13:53:42 -0800835}
836
837#endif // HRSS_HAVE_VECTOR_UNIT
838
839// HRSS_poly3_invert sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod
840// Φ(N).
841void HRSS_poly3_invert(struct poly3 *out, const struct poly3 *in) {
842 // The vector version of this function seems slightly slower on AArch64, but
843 // is useful on ARMv7 and x86-64.
844#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
845 if (vec_capable()) {
846 poly3_invert_vec(out, in);
847 return;
848 }
849#endif
850
Adam Langleycbae9652019-12-06 14:58:52 -0800851 // This algorithm is taken from section 7.1 of [SAFEGCD].
852 struct poly3 v, r, f, g;
853 // v = 0
854 poly3_zero(&v);
855 // r = 1
856 poly3_zero(&r);
857 r.a.v[0] = 1;
858 // f = all ones.
859 OPENSSL_memset(&f.s, 0, sizeof(struct poly2));
860 OPENSSL_memset(&f.a, 0xff, sizeof(struct poly2));
861 f.a.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD;
862 // g is the reversal of |in|.
863 poly3_reverse_700(&g, in);
864 int delta = 1;
Adam Langley7b935932018-11-12 13:53:42 -0800865
Adam Langleycbae9652019-12-06 14:58:52 -0800866 for (size_t i = 0; i < (2*(N-1)) - 1; i++) {
867 poly3_lshift1(&v);
Adam Langley7b935932018-11-12 13:53:42 -0800868
Adam Langleycbae9652019-12-06 14:58:52 -0800869 const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1;
870 const crypto_word_t delta_is_non_negative = delta_sign_bit - 1;
871 const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta);
872 const crypto_word_t g_has_constant_term = lsb_to_all(g.a.v[0]);
873 const crypto_word_t mask =
874 g_has_constant_term & delta_is_non_negative & delta_is_non_zero;
Adam Langley7b935932018-11-12 13:53:42 -0800875
Adam Langleycbae9652019-12-06 14:58:52 -0800876 crypto_word_t c_s, c_a;
877 poly3_word_mul(&c_s, &c_a, f.s.v[0], f.a.v[0], g.s.v[0], g.a.v[0]);
878 c_s = lsb_to_all(c_s);
879 c_a = lsb_to_all(c_a);
Adam Langley7b935932018-11-12 13:53:42 -0800880
Adam Langleycbae9652019-12-06 14:58:52 -0800881 delta = constant_time_select_int(mask, -delta, delta);
882 delta++;
Adam Langley7b935932018-11-12 13:53:42 -0800883
Adam Langleycbae9652019-12-06 14:58:52 -0800884 poly3_cswap(&f, &g, mask);
885 poly3_fmsub(&g, &f, c_s, c_a);
886 poly3_rshift1(&g);
Adam Langley7b935932018-11-12 13:53:42 -0800887
Adam Langleycbae9652019-12-06 14:58:52 -0800888 poly3_cswap(&v, &r, mask);
889 poly3_fmsub(&r, &v, c_s, c_a);
Adam Langley7b935932018-11-12 13:53:42 -0800890 }
891
Adam Langleycbae9652019-12-06 14:58:52 -0800892 assert(delta == 0);
893 poly3_mul_const(&v, f.s.v[0], f.a.v[0]);
894 poly3_reverse_700(out, &v);
Adam Langley7b935932018-11-12 13:53:42 -0800895}
896
897// Polynomials in Q.
898
899// Coefficients are reduced mod Q. (Q is clearly not prime, therefore the
900// coefficients do not form a field.)
901#define Q 8192
902
903// VECS_PER_POLY is the number of 128-bit vectors needed to represent a
904// polynomial.
905#define COEFFICIENTS_PER_VEC (sizeof(vec_t) / sizeof(uint16_t))
906#define VECS_PER_POLY ((N + COEFFICIENTS_PER_VEC - 1) / COEFFICIENTS_PER_VEC)
907
908// poly represents a polynomial with coefficients mod Q. Note that, while Q is a
909// power of two, this does not operate in GF(Q). That would be a binary field
910// but this is simply mod Q. Thus the coefficients are not a field.
911//
912// Coefficients are ordered little-endian, thus the coefficient of x^0 is the
913// first element of the array.
914struct poly {
915#if defined(HRSS_HAVE_VECTOR_UNIT)
916 union {
917 // N + 3 = 704, which is a multiple of 64 and thus aligns things, esp for
918 // the vector code.
919 uint16_t v[N + 3];
920 vec_t vectors[VECS_PER_POLY];
921 };
922#else
Adam Langley1ea083d2018-12-12 10:46:13 -0800923 // Even if !HRSS_HAVE_VECTOR_UNIT, external assembly may be called that
924 // requires alignment.
925 alignas(16) uint16_t v[N + 3];
Adam Langley7b935932018-11-12 13:53:42 -0800926#endif
927};
928
Adam Langleyf7e1a942022-04-01 11:46:32 -0700929// poly_normalize zeros out the excess elements of |x| which are included only
930// for alignment.
931static void poly_normalize(struct poly *x) {
932 OPENSSL_memset(&x->v[N], 0, 3 * sizeof(uint16_t));
933}
934
935// poly_assert_normalized asserts that the excess elements of |x| are zeroed out
936// for the cases that case. (E.g. |poly_mul_vec|.)
937static void poly_assert_normalized(const struct poly *x) {
938 assert(x->v[N] == 0);
939 assert(x->v[N + 1] == 0);
940 assert(x->v[N + 2] == 0);
941}
942
Adam Langley7b935932018-11-12 13:53:42 -0800943OPENSSL_UNUSED static void poly_print(const struct poly *p) {
944 printf("[");
945 for (unsigned i = 0; i < N; i++) {
946 if (i) {
947 printf(" ");
948 }
949 printf("%d", p->v[i]);
950 }
951 printf("]\n");
952}
953
Adam Langley71530132021-06-30 16:33:47 -0700954// POLY_MUL_SCRATCH contains space for the working variables needed by
955// |poly_mul|. The contents afterwards may be discarded, but the object may also
956// be reused with future |poly_mul| calls to save heap allocations.
957//
958// This object must have 32-byte alignment.
959struct POLY_MUL_SCRATCH {
960 union {
961 // This is used by |poly_mul_novec|.
962 struct {
963 uint16_t prod[2 * N];
964 uint16_t scratch[1318];
965 } novec;
966
967#if defined(HRSS_HAVE_VECTOR_UNIT)
968 // This is used by |poly_mul_vec|.
969 struct {
970 vec_t prod[VECS_PER_POLY * 2];
971 vec_t scratch[172];
972 } vec;
973#endif
Adam Langley04b3a962023-02-08 21:08:49 +0000974
975#if defined(POLY_RQ_MUL_ASM)
976 // This is the space used by |poly_Rq_mul|.
977 uint8_t rq[POLY_MUL_RQ_SCRATCH_SPACE];
978#endif
Adam Langley71530132021-06-30 16:33:47 -0700979 } u;
980};
981
Adam Langley7b935932018-11-12 13:53:42 -0800982#if defined(HRSS_HAVE_VECTOR_UNIT)
983
984// poly_mul_vec_aux is a recursive function that multiplies |n| words from |a|
985// and |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements
986// of |scratch| and the function recurses, except if |n| < 3, when |scratch|
987// isn't used and the recursion stops. If |n| == |VECS_PER_POLY| then |scratch|
988// needs 172 elements.
989static void poly_mul_vec_aux(vec_t *restrict out, vec_t *restrict scratch,
990 const vec_t *restrict a, const vec_t *restrict b,
991 const size_t n) {
992 // In [HRSS], the technique they used for polynomial multiplication is
993 // described: they start with Toom-4 at the top level and then two layers of
994 // Karatsuba. Karatsuba is a specific instance of the general Toom–Cook
995 // decomposition, which splits an input n-ways and produces 2n-1
996 // multiplications of those parts. So, starting with 704 coefficients (rounded
997 // up from 701 to have more factors of two), Toom-4 gives seven
998 // multiplications of degree-174 polynomials. Each round of Karatsuba (which
999 // is Toom-2) increases the number of multiplications by a factor of three
1000 // while halving the size of the values being multiplied. So two rounds gives
1001 // 63 multiplications of degree-44 polynomials. Then they (I think) form
1002 // vectors by gathering all 63 coefficients of each power together, for each
1003 // input, and doing more rounds of Karatsuba on the vectors until they bottom-
1004 // out somewhere with schoolbook multiplication.
1005 //
1006 // I tried something like that for NEON. NEON vectors are 128 bits so hold
1007 // eight coefficients. I wrote a function that did Karatsuba on eight
1008 // multiplications at the same time, using such vectors, and a Go script that
1009 // decomposed from degree-704, with Karatsuba in non-transposed form, until it
1010 // reached multiplications of degree-44. It batched up those 81
1011 // multiplications into lots of eight with a single one left over (which was
1012 // handled directly).
1013 //
1014 // It worked, but it was significantly slower than the dumb algorithm used
1015 // below. Potentially that was because I misunderstood how [HRSS] did it, or
1016 // because Clang is bad at generating good code from NEON intrinsics on ARMv7.
1017 // (Which is true: the code generated by Clang for the below is pretty crap.)
1018 //
1019 // This algorithm is much simpler. It just does Karatsuba decomposition all
1020 // the way down and never transposes. When it gets down to degree-16 or
1021 // degree-24 values, they are multiplied using schoolbook multiplication and
1022 // vector intrinsics. The vector operations form each of the eight phase-
1023 // shifts of one of the inputs, point-wise multiply, and then add into the
1024 // result at the correct place. This means that 33% (degree-16) or 25%
1025 // (degree-24) of the multiplies and adds are wasted, but it does ok.
1026 if (n == 2) {
1027 vec_t result[4];
1028 vec_t vec_a[3];
1029 static const vec_t kZero = {0};
1030 vec_a[0] = a[0];
1031 vec_a[1] = a[1];
1032 vec_a[2] = kZero;
1033
1034 result[0] = vec_mul(vec_a[0], vec_get_word(b[0], 0));
1035 result[1] = vec_mul(vec_a[1], vec_get_word(b[0], 0));
1036
1037 result[1] = vec_fma(result[1], vec_a[0], vec_get_word(b[1], 0));
1038 result[2] = vec_mul(vec_a[1], vec_get_word(b[1], 0));
1039 result[3] = kZero;
1040
1041 vec3_rshift_word(vec_a);
1042
1043#define BLOCK(x, y) \
1044 do { \
1045 result[x + 0] = \
1046 vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
1047 result[x + 1] = \
1048 vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
1049 result[x + 2] = \
1050 vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
1051 } while (0)
1052
1053 BLOCK(0, 1);
1054 BLOCK(1, 9);
1055
1056 vec3_rshift_word(vec_a);
1057
1058 BLOCK(0, 2);
1059 BLOCK(1, 10);
1060
1061 vec3_rshift_word(vec_a);
1062
1063 BLOCK(0, 3);
1064 BLOCK(1, 11);
1065
1066 vec3_rshift_word(vec_a);
1067
1068 BLOCK(0, 4);
1069 BLOCK(1, 12);
1070
1071 vec3_rshift_word(vec_a);
1072
1073 BLOCK(0, 5);
1074 BLOCK(1, 13);
1075
1076 vec3_rshift_word(vec_a);
1077
1078 BLOCK(0, 6);
1079 BLOCK(1, 14);
1080
1081 vec3_rshift_word(vec_a);
1082
1083 BLOCK(0, 7);
1084 BLOCK(1, 15);
1085
1086#undef BLOCK
1087
1088 memcpy(out, result, sizeof(result));
1089 return;
1090 }
1091
1092 if (n == 3) {
1093 vec_t result[6];
1094 vec_t vec_a[4];
1095 static const vec_t kZero = {0};
1096 vec_a[0] = a[0];
1097 vec_a[1] = a[1];
1098 vec_a[2] = a[2];
1099 vec_a[3] = kZero;
1100
1101 result[0] = vec_mul(a[0], vec_get_word(b[0], 0));
1102 result[1] = vec_mul(a[1], vec_get_word(b[0], 0));
1103 result[2] = vec_mul(a[2], vec_get_word(b[0], 0));
1104
1105#define BLOCK_PRE(x, y) \
1106 do { \
1107 result[x + 0] = \
1108 vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
1109 result[x + 1] = \
1110 vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
1111 result[x + 2] = vec_mul(vec_a[2], vec_get_word(b[y / 8], y % 8)); \
1112 } while (0)
1113
1114 BLOCK_PRE(1, 8);
1115 BLOCK_PRE(2, 16);
1116
1117 result[5] = kZero;
1118
1119 vec4_rshift_word(vec_a);
1120
1121#define BLOCK(x, y) \
1122 do { \
1123 result[x + 0] = \
1124 vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
1125 result[x + 1] = \
1126 vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
1127 result[x + 2] = \
1128 vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
1129 result[x + 3] = \
1130 vec_fma(result[x + 3], vec_a[3], vec_get_word(b[y / 8], y % 8)); \
1131 } while (0)
1132
1133 BLOCK(0, 1);
1134 BLOCK(1, 9);
1135 BLOCK(2, 17);
1136
1137 vec4_rshift_word(vec_a);
1138
1139 BLOCK(0, 2);
1140 BLOCK(1, 10);
1141 BLOCK(2, 18);
1142
1143 vec4_rshift_word(vec_a);
1144
1145 BLOCK(0, 3);
1146 BLOCK(1, 11);
1147 BLOCK(2, 19);
1148
1149 vec4_rshift_word(vec_a);
1150
1151 BLOCK(0, 4);
1152 BLOCK(1, 12);
1153 BLOCK(2, 20);
1154
1155 vec4_rshift_word(vec_a);
1156
1157 BLOCK(0, 5);
1158 BLOCK(1, 13);
1159 BLOCK(2, 21);
1160
1161 vec4_rshift_word(vec_a);
1162
1163 BLOCK(0, 6);
1164 BLOCK(1, 14);
1165 BLOCK(2, 22);
1166
1167 vec4_rshift_word(vec_a);
1168
1169 BLOCK(0, 7);
1170 BLOCK(1, 15);
1171 BLOCK(2, 23);
1172
1173#undef BLOCK
1174#undef BLOCK_PRE
1175
1176 memcpy(out, result, sizeof(result));
1177
1178 return;
1179 }
1180
1181 // Karatsuba multiplication.
1182 // https://en.wikipedia.org/wiki/Karatsuba_algorithm
1183
1184 // When |n| is odd, the two "halves" will have different lengths. The first is
1185 // always the smaller.
1186 const size_t low_len = n / 2;
1187 const size_t high_len = n - low_len;
1188 const vec_t *a_high = &a[low_len];
1189 const vec_t *b_high = &b[low_len];
1190
1191 // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second
1192 // half.
1193 for (size_t i = 0; i < low_len; i++) {
1194 out[i] = vec_add(a_high[i], a[i]);
1195 out[high_len + i] = vec_add(b_high[i], b[i]);
1196 }
1197 if (high_len != low_len) {
1198 out[low_len] = a_high[low_len];
1199 out[high_len + low_len] = b_high[low_len];
1200 }
1201
1202 vec_t *const child_scratch = &scratch[2 * high_len];
1203 // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
1204 poly_mul_vec_aux(scratch, child_scratch, out, &out[high_len], high_len);
1205 // Calculate a_1 × b_1.
1206 poly_mul_vec_aux(&out[low_len * 2], child_scratch, a_high, b_high, high_len);
1207 // Calculate a_0 × b_0.
1208 poly_mul_vec_aux(out, child_scratch, a, b, low_len);
1209
1210 // Subtract those last two products from the first.
1211 for (size_t i = 0; i < low_len * 2; i++) {
1212 scratch[i] = vec_sub(scratch[i], vec_add(out[i], out[low_len * 2 + i]));
1213 }
1214 if (low_len != high_len) {
1215 scratch[low_len * 2] = vec_sub(scratch[low_len * 2], out[low_len * 4]);
1216 scratch[low_len * 2 + 1] =
1217 vec_sub(scratch[low_len * 2 + 1], out[low_len * 4 + 1]);
1218 }
1219
1220 // Add the middle product into the output.
1221 for (size_t i = 0; i < high_len * 2; i++) {
1222 out[low_len + i] = vec_add(out[low_len + i], scratch[i]);
1223 }
1224}
1225
1226// poly_mul_vec sets |*out| to |x|×|y| mod (𝑥^n - 1).
Adam Langley71530132021-06-30 16:33:47 -07001227static void poly_mul_vec(struct POLY_MUL_SCRATCH *scratch, struct poly *out,
1228 const struct poly *x, const struct poly *y) {
David Benjaminb7d63202022-07-26 13:25:02 -07001229 static_assert(sizeof(out->v) == sizeof(vec_t) * VECS_PER_POLY,
1230 "struct poly is the wrong size");
1231 static_assert(alignof(struct poly) == alignof(vec_t),
1232 "struct poly has incorrect alignment");
Adam Langleyf7e1a942022-04-01 11:46:32 -07001233 poly_assert_normalized(x);
1234 poly_assert_normalized(y);
Adam Langley7b935932018-11-12 13:53:42 -08001235
Adam Langley71530132021-06-30 16:33:47 -07001236 vec_t *const prod = scratch->u.vec.prod;
1237 vec_t *const aux_scratch = scratch->u.vec.scratch;
1238 poly_mul_vec_aux(prod, aux_scratch, x->vectors, y->vectors, VECS_PER_POLY);
Adam Langley7b935932018-11-12 13:53:42 -08001239
1240 // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the
1241 // upper-half to the lower-half. However, N is 701, which isn't a multiple of
1242 // the vector size, so the upper-half vectors all have to be shifted before
1243 // being added to the lower-half.
1244 vec_t *out_vecs = (vec_t *)out->v;
1245
1246 for (size_t i = 0; i < VECS_PER_POLY; i++) {
1247 const vec_t prev = prod[VECS_PER_POLY - 1 + i];
1248 const vec_t this = prod[VECS_PER_POLY + i];
1249 out_vecs[i] = vec_add(prod[i], vec_merge_3_5(prev, this));
1250 }
1251
1252 OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
1253}
1254
1255#endif // HRSS_HAVE_VECTOR_UNIT
1256
1257// poly_mul_novec_aux writes the product of |a| and |b| to |out|, using
1258// |scratch| as scratch space. It'll use Karatsuba if the inputs are large
1259// enough to warrant it. Each call uses 2*ceil(n/2) elements of |scratch| and
1260// the function recurses, except if |n| < 64, when |scratch| isn't used and the
1261// recursion stops. If |n| == |N| then |scratch| needs 1318 elements.
1262static void poly_mul_novec_aux(uint16_t *out, uint16_t *scratch,
1263 const uint16_t *a, const uint16_t *b, size_t n) {
1264 static const size_t kSchoolbookLimit = 64;
1265 if (n < kSchoolbookLimit) {
1266 OPENSSL_memset(out, 0, sizeof(uint16_t) * n * 2);
1267 for (size_t i = 0; i < n; i++) {
1268 for (size_t j = 0; j < n; j++) {
1269 out[i + j] += (unsigned) a[i] * b[j];
1270 }
1271 }
1272
1273 return;
1274 }
1275
1276 // Karatsuba multiplication.
1277 // https://en.wikipedia.org/wiki/Karatsuba_algorithm
1278
1279 // When |n| is odd, the two "halves" will have different lengths. The
1280 // first is always the smaller.
1281 const size_t low_len = n / 2;
1282 const size_t high_len = n - low_len;
1283 const uint16_t *const a_high = &a[low_len];
1284 const uint16_t *const b_high = &b[low_len];
1285
1286 for (size_t i = 0; i < low_len; i++) {
1287 out[i] = a_high[i] + a[i];
1288 out[high_len + i] = b_high[i] + b[i];
1289 }
1290 if (high_len != low_len) {
1291 out[low_len] = a_high[low_len];
1292 out[high_len + low_len] = b_high[low_len];
1293 }
1294
1295 uint16_t *const child_scratch = &scratch[2 * high_len];
1296 poly_mul_novec_aux(scratch, child_scratch, out, &out[high_len], high_len);
1297 poly_mul_novec_aux(&out[low_len * 2], child_scratch, a_high, b_high,
1298 high_len);
1299 poly_mul_novec_aux(out, child_scratch, a, b, low_len);
1300
1301 for (size_t i = 0; i < low_len * 2; i++) {
1302 scratch[i] -= out[i] + out[low_len * 2 + i];
1303 }
1304 if (low_len != high_len) {
1305 scratch[low_len * 2] -= out[low_len * 4];
1306 assert(out[low_len * 4 + 1] == 0);
1307 }
1308
1309 for (size_t i = 0; i < high_len * 2; i++) {
1310 out[low_len + i] += scratch[i];
1311 }
1312}
1313
1314// poly_mul_novec sets |*out| to |x|×|y| mod (𝑥^n - 1).
Adam Langley71530132021-06-30 16:33:47 -07001315static void poly_mul_novec(struct POLY_MUL_SCRATCH *scratch, struct poly *out,
1316 const struct poly *x, const struct poly *y) {
1317 uint16_t *const prod = scratch->u.novec.prod;
1318 uint16_t *const aux_scratch = scratch->u.novec.scratch;
1319 poly_mul_novec_aux(prod, aux_scratch, x->v, y->v, N);
Adam Langley7b935932018-11-12 13:53:42 -08001320
1321 for (size_t i = 0; i < N; i++) {
1322 out->v[i] = prod[i] + prod[i + N];
1323 }
1324 OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
1325}
1326
Adam Langley71530132021-06-30 16:33:47 -07001327static void poly_mul(struct POLY_MUL_SCRATCH *scratch, struct poly *r,
1328 const struct poly *a, const struct poly *b) {
Adam Langley04b3a962023-02-08 21:08:49 +00001329#if defined(POLY_RQ_MUL_ASM)
1330 if (CRYPTO_is_AVX2_capable()) {
1331 poly_Rq_mul(r->v, a->v, b->v, scratch->u.rq);
1332 poly_normalize(r);
1333 } else
1334#endif
1335
Adam Langley7b935932018-11-12 13:53:42 -08001336#if defined(HRSS_HAVE_VECTOR_UNIT)
1337 if (vec_capable()) {
Adam Langley71530132021-06-30 16:33:47 -07001338 poly_mul_vec(scratch, r, a, b);
Adam Langleyf7e1a942022-04-01 11:46:32 -07001339 } else
Adam Langley7b935932018-11-12 13:53:42 -08001340#endif
1341
Adam Langley7b935932018-11-12 13:53:42 -08001342 // Fallback, non-vector case.
Adam Langleyf7e1a942022-04-01 11:46:32 -07001343 {
1344 poly_mul_novec(scratch, r, a, b);
1345 }
1346
1347 poly_assert_normalized(r);
Adam Langley7b935932018-11-12 13:53:42 -08001348}
1349
1350// poly_mul_x_minus_1 sets |p| to |p|×(𝑥 - 1) mod (𝑥^n - 1).
1351static void poly_mul_x_minus_1(struct poly *p) {
1352 // Multiplying by (𝑥 - 1) means negating each coefficient and adding in
1353 // the value of the previous one.
1354 const uint16_t orig_final_coefficient = p->v[N - 1];
1355
1356 for (size_t i = N - 1; i > 0; i--) {
1357 p->v[i] = p->v[i - 1] - p->v[i];
1358 }
1359 p->v[0] = orig_final_coefficient - p->v[0];
1360}
1361
1362// poly_mod_phiN sets |p| to |p| mod Φ(N).
1363static void poly_mod_phiN(struct poly *p) {
1364 const uint16_t coeff700 = p->v[N - 1];
1365
1366 for (unsigned i = 0; i < N; i++) {
1367 p->v[i] -= coeff700;
1368 }
1369}
1370
1371// poly_clamp reduces each coefficient mod Q.
1372static void poly_clamp(struct poly *p) {
1373 for (unsigned i = 0; i < N; i++) {
1374 p->v[i] &= Q - 1;
1375 }
1376}
1377
1378
1379// Conversion functions
1380// --------------------
1381
1382// poly2_from_poly sets |*out| to |in| mod 2.
1383static void poly2_from_poly(struct poly2 *out, const struct poly *in) {
1384 crypto_word_t *words = out->v;
1385 unsigned shift = 0;
1386 crypto_word_t word = 0;
1387
1388 for (unsigned i = 0; i < N; i++) {
1389 word >>= 1;
1390 word |= (crypto_word_t)(in->v[i] & 1) << (BITS_PER_WORD - 1);
1391 shift++;
1392
1393 if (shift == BITS_PER_WORD) {
1394 *words = word;
1395 words++;
1396 word = 0;
1397 shift = 0;
1398 }
1399 }
1400
1401 word >>= BITS_PER_WORD - shift;
1402 *words = word;
1403}
1404
Adam Langley72f01552019-01-21 12:53:43 -08001405// mod3 treats |a| as a signed number and returns |a| mod 3.
Adam Langley7b935932018-11-12 13:53:42 -08001406static uint16_t mod3(int16_t a) {
1407 const int16_t q = ((int32_t)a * 21845) >> 16;
1408 int16_t ret = a - 3 * q;
1409 // At this point, |ret| is in {0, 1, 2, 3} and that needs to be mapped to {0,
1410 // 1, 2, 0}.
1411 return ret & ((ret & (ret >> 1)) - 1);
1412}
1413
1414// poly3_from_poly sets |*out| to |in|.
1415static void poly3_from_poly(struct poly3 *out, const struct poly *in) {
1416 crypto_word_t *words_s = out->s.v;
1417 crypto_word_t *words_a = out->a.v;
1418 crypto_word_t s = 0;
1419 crypto_word_t a = 0;
1420 unsigned shift = 0;
1421
1422 for (unsigned i = 0; i < N; i++) {
1423 // This duplicates the 13th bit upwards to the top of the uint16,
1424 // essentially treating it as a sign bit and converting into a signed int16.
1425 // The signed value is reduced mod 3, yielding {0, 1, 2}.
1426 const uint16_t v = mod3((int16_t)(in->v[i] << 3) >> 3);
1427 s >>= 1;
Adam Langleyeadef472019-01-16 17:20:24 -08001428 const crypto_word_t s_bit = (crypto_word_t)(v & 2) << (BITS_PER_WORD - 2);
1429 s |= s_bit;
Adam Langley7b935932018-11-12 13:53:42 -08001430 a >>= 1;
Adam Langleyeadef472019-01-16 17:20:24 -08001431 a |= s_bit | (crypto_word_t)(v & 1) << (BITS_PER_WORD - 1);
Adam Langley7b935932018-11-12 13:53:42 -08001432 shift++;
1433
1434 if (shift == BITS_PER_WORD) {
1435 *words_s = s;
1436 words_s++;
1437 *words_a = a;
1438 words_a++;
1439 s = a = 0;
1440 shift = 0;
1441 }
1442 }
1443
1444 s >>= BITS_PER_WORD - shift;
1445 a >>= BITS_PER_WORD - shift;
1446 *words_s = s;
1447 *words_a = a;
1448}
1449
1450// poly3_from_poly_checked sets |*out| to |in|, which has coefficients in {0, 1,
1451// Q-1}. It returns a mask indicating whether all coefficients were found to be
1452// in that set.
1453static crypto_word_t poly3_from_poly_checked(struct poly3 *out,
1454 const struct poly *in) {
1455 crypto_word_t *words_s = out->s.v;
1456 crypto_word_t *words_a = out->a.v;
1457 crypto_word_t s = 0;
1458 crypto_word_t a = 0;
1459 unsigned shift = 0;
1460 crypto_word_t ok = CONSTTIME_TRUE_W;
1461
1462 for (unsigned i = 0; i < N; i++) {
1463 const uint16_t v = in->v[i];
1464 // Maps {0, 1, Q-1} to {0, 1, 2}.
1465 uint16_t mod3 = v & 3;
1466 mod3 ^= mod3 >> 1;
1467 const uint16_t expected = (uint16_t)((~((mod3 >> 1) - 1)) | mod3) % Q;
1468 ok &= constant_time_eq_w(v, expected);
1469
1470 s >>= 1;
Adam Langleyeadef472019-01-16 17:20:24 -08001471 const crypto_word_t s_bit = (crypto_word_t)(mod3 & 2)
1472 << (BITS_PER_WORD - 2);
1473 s |= s_bit;
Adam Langley7b935932018-11-12 13:53:42 -08001474 a >>= 1;
Adam Langleyeadef472019-01-16 17:20:24 -08001475 a |= s_bit | (crypto_word_t)(mod3 & 1) << (BITS_PER_WORD - 1);
Adam Langley7b935932018-11-12 13:53:42 -08001476 shift++;
1477
1478 if (shift == BITS_PER_WORD) {
1479 *words_s = s;
1480 words_s++;
1481 *words_a = a;
1482 words_a++;
1483 s = a = 0;
1484 shift = 0;
1485 }
1486 }
1487
1488 s >>= BITS_PER_WORD - shift;
1489 a >>= BITS_PER_WORD - shift;
1490 *words_s = s;
1491 *words_a = a;
1492
1493 return ok;
1494}
1495
1496static void poly_from_poly2(struct poly *out, const struct poly2 *in) {
1497 const crypto_word_t *words = in->v;
1498 unsigned shift = 0;
1499 crypto_word_t word = *words;
1500
1501 for (unsigned i = 0; i < N; i++) {
1502 out->v[i] = word & 1;
1503 word >>= 1;
1504 shift++;
1505
1506 if (shift == BITS_PER_WORD) {
1507 words++;
1508 word = *words;
1509 shift = 0;
1510 }
1511 }
Adam Langleyf7e1a942022-04-01 11:46:32 -07001512
1513 poly_normalize(out);
Adam Langley7b935932018-11-12 13:53:42 -08001514}
1515
1516static void poly_from_poly3(struct poly *out, const struct poly3 *in) {
1517 const crypto_word_t *words_s = in->s.v;
1518 const crypto_word_t *words_a = in->a.v;
1519 crypto_word_t word_s = ~(*words_s);
1520 crypto_word_t word_a = *words_a;
1521 unsigned shift = 0;
1522
1523 for (unsigned i = 0; i < N; i++) {
1524 out->v[i] = (uint16_t)(word_s & 1) - 1;
1525 out->v[i] |= word_a & 1;
1526 word_s >>= 1;
1527 word_a >>= 1;
1528 shift++;
1529
1530 if (shift == BITS_PER_WORD) {
1531 words_s++;
1532 words_a++;
1533 word_s = ~(*words_s);
1534 word_a = *words_a;
1535 shift = 0;
1536 }
1537 }
Adam Langleyf7e1a942022-04-01 11:46:32 -07001538
1539 poly_normalize(out);
Adam Langley7b935932018-11-12 13:53:42 -08001540}
1541
1542// Polynomial inversion
1543// --------------------
1544
1545// poly_invert_mod2 sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod
1546// Φ(N)), all mod 2. This isn't useful in itself, but is part of doing inversion
1547// mod Q.
1548static void poly_invert_mod2(struct poly *out, const struct poly *in) {
Adam Langleycbae9652019-12-06 14:58:52 -08001549 // This algorithm is taken from section 7.1 of [SAFEGCD].
1550 struct poly2 v, r, f, g;
Adam Langley7b935932018-11-12 13:53:42 -08001551
Adam Langleycbae9652019-12-06 14:58:52 -08001552 // v = 0
1553 poly2_zero(&v);
1554 // r = 1
1555 poly2_zero(&r);
1556 r.v[0] = 1;
1557 // f = all ones.
1558 OPENSSL_memset(&f, 0xff, sizeof(struct poly2));
1559 f.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD;
1560 // g is the reversal of |in|.
1561 poly2_from_poly(&g, in);
1562 poly2_mod_phiN(&g);
1563 poly2_reverse_700(&g, &g);
1564 int delta = 1;
Adam Langley7b935932018-11-12 13:53:42 -08001565
Adam Langleycbae9652019-12-06 14:58:52 -08001566 for (size_t i = 0; i < (2*(N-1)) - 1; i++) {
1567 poly2_lshift1(&v);
Adam Langley7b935932018-11-12 13:53:42 -08001568
Adam Langleycbae9652019-12-06 14:58:52 -08001569 const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1;
1570 const crypto_word_t delta_is_non_negative = delta_sign_bit - 1;
1571 const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta);
1572 const crypto_word_t g_has_constant_term = lsb_to_all(g.v[0]);
1573 const crypto_word_t mask =
1574 g_has_constant_term & delta_is_non_negative & delta_is_non_zero;
Adam Langley7b935932018-11-12 13:53:42 -08001575
Adam Langleycbae9652019-12-06 14:58:52 -08001576 const crypto_word_t c = lsb_to_all(f.v[0] & g.v[0]);
Adam Langley7b935932018-11-12 13:53:42 -08001577
Adam Langleycbae9652019-12-06 14:58:52 -08001578 delta = constant_time_select_int(mask, -delta, delta);
1579 delta++;
Adam Langley7b935932018-11-12 13:53:42 -08001580
Adam Langleycbae9652019-12-06 14:58:52 -08001581 poly2_cswap(&f, &g, mask);
1582 poly2_fmadd(&g, &f, c);
1583 poly2_rshift1(&g);
1584
1585 poly2_cswap(&v, &r, mask);
1586 poly2_fmadd(&r, &v, c);
Adam Langley7b935932018-11-12 13:53:42 -08001587 }
1588
Adam Langleycbae9652019-12-06 14:58:52 -08001589 assert(delta == 0);
1590 assert(f.v[0] & 1);
1591 poly2_reverse_700(&v, &v);
1592 poly_from_poly2(out, &v);
Adam Langleyf7e1a942022-04-01 11:46:32 -07001593 poly_assert_normalized(out);
Adam Langley7b935932018-11-12 13:53:42 -08001594}
1595
1596// poly_invert sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod Φ(N)).
Adam Langley71530132021-06-30 16:33:47 -07001597static void poly_invert(struct POLY_MUL_SCRATCH *scratch, struct poly *out,
1598 const struct poly *in) {
Adam Langley7b935932018-11-12 13:53:42 -08001599 // Inversion mod Q, which is done based on the result of inverting mod
1600 // 2. See [NTRUTN14] paper, bottom of page two.
1601 struct poly a, *b, tmp;
1602
1603 // a = -in.
1604 for (unsigned i = 0; i < N; i++) {
1605 a.v[i] = -in->v[i];
1606 }
Adam Langleyf7e1a942022-04-01 11:46:32 -07001607 poly_normalize(&a);
Adam Langley7b935932018-11-12 13:53:42 -08001608
1609 // b = in^-1 mod 2.
1610 b = out;
1611 poly_invert_mod2(b, in);
1612
1613 // We are working mod Q=2**13 and we need to iterate ceil(log_2(13))
1614 // times, which is four.
1615 for (unsigned i = 0; i < 4; i++) {
Adam Langley71530132021-06-30 16:33:47 -07001616 poly_mul(scratch, &tmp, &a, b);
Adam Langley7b935932018-11-12 13:53:42 -08001617 tmp.v[0] += 2;
Adam Langley71530132021-06-30 16:33:47 -07001618 poly_mul(scratch, b, b, &tmp);
Adam Langley7b935932018-11-12 13:53:42 -08001619 }
Adam Langleyf7e1a942022-04-01 11:46:32 -07001620
1621 poly_assert_normalized(out);
Adam Langley7b935932018-11-12 13:53:42 -08001622}
1623
1624// Marshal and unmarshal functions for various basic types.
1625// --------------------------------------------------------
1626
1627#define POLY_BYTES 1138
1628
Adam Langley9700b442018-12-17 11:17:26 -08001629// poly_marshal serialises all but the final coefficient of |in| to |out|.
Adam Langley7b935932018-11-12 13:53:42 -08001630static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in) {
1631 const uint16_t *p = in->v;
1632
1633 for (size_t i = 0; i < N / 8; i++) {
1634 out[0] = p[0];
1635 out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5);
1636 out[2] = p[1] >> 3;
1637 out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2);
1638 out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7);
1639 out[5] = p[3] >> 1;
1640 out[6] = (0xf & (p[3] >> 9)) | ((p[4] & 0x0f) << 4);
1641 out[7] = p[4] >> 4;
1642 out[8] = (1 & (p[4] >> 12)) | ((p[5] & 0x7f) << 1);
1643 out[9] = (0x3f & (p[5] >> 7)) | ((p[6] & 0x03) << 6);
1644 out[10] = p[6] >> 2;
1645 out[11] = (7 & (p[6] >> 10)) | ((p[7] & 0x1f) << 3);
1646 out[12] = p[7] >> 5;
1647
1648 p += 8;
1649 out += 13;
1650 }
1651
1652 // There are four remaining values.
1653 out[0] = p[0];
1654 out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5);
1655 out[2] = p[1] >> 3;
1656 out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2);
1657 out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7);
1658 out[5] = p[3] >> 1;
1659 out[6] = 0xf & (p[3] >> 9);
1660}
1661
Adam Langley9700b442018-12-17 11:17:26 -08001662// poly_unmarshal parses the output of |poly_marshal| and sets |out| such that
1663// all but the final coefficients match, and the final coefficient is calculated
1664// such that evaluating |out| at one results in zero. It returns one on success
1665// or zero if |in| is an invalid encoding.
Adam Langleyf8068ce2018-12-17 11:16:23 -08001666static int poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES]) {
Adam Langley7b935932018-11-12 13:53:42 -08001667 uint16_t *p = out->v;
1668
1669 for (size_t i = 0; i < N / 8; i++) {
1670 p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8;
1671 p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 |
1672 (uint16_t)(in[3] & 3) << 11;
1673 p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6;
1674 p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 |
1675 (uint16_t)(in[6] & 0xf) << 9;
1676 p[4] = (uint16_t)(in[6] >> 4) | (uint16_t)(in[7]) << 4 |
1677 (uint16_t)(in[8] & 1) << 12;
1678 p[5] = (uint16_t)(in[8] >> 1) | (uint16_t)(in[9] & 0x3f) << 7;
1679 p[6] = (uint16_t)(in[9] >> 6) | (uint16_t)(in[10]) << 2 |
1680 (uint16_t)(in[11] & 7) << 10;
1681 p[7] = (uint16_t)(in[11] >> 3) | (uint16_t)(in[12]) << 5;
1682
1683 p += 8;
1684 in += 13;
1685 }
1686
1687 // There are four coefficients remaining.
1688 p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8;
1689 p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 |
1690 (uint16_t)(in[3] & 3) << 11;
1691 p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6;
1692 p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 |
1693 (uint16_t)(in[6] & 0xf) << 9;
1694
1695 for (unsigned i = 0; i < N - 1; i++) {
1696 out->v[i] = (int16_t)(out->v[i] << 3) >> 3;
1697 }
1698
Adam Langleyf8068ce2018-12-17 11:16:23 -08001699 // There are four unused bits in the last byte. We require them to be zero.
1700 if ((in[6] & 0xf0) != 0) {
1701 return 0;
1702 }
Adam Langley7b935932018-11-12 13:53:42 -08001703
1704 // Set the final coefficient as specifed in [HRSSNIST] 1.9.2 step 6.
1705 uint32_t sum = 0;
1706 for (size_t i = 0; i < N - 1; i++) {
1707 sum += out->v[i];
1708 }
1709
1710 out->v[N - 1] = (uint16_t)(0u - sum);
Adam Langleyf7e1a942022-04-01 11:46:32 -07001711 poly_normalize(out);
Adam Langleyf8068ce2018-12-17 11:16:23 -08001712
1713 return 1;
Adam Langley7b935932018-11-12 13:53:42 -08001714}
1715
1716// mod3_from_modQ maps {0, 1, Q-1, 65535} -> {0, 1, 2, 2}. Note that |v| may
1717// have an invalid value when processing attacker-controlled inputs.
1718static uint16_t mod3_from_modQ(uint16_t v) {
1719 v &= 3;
1720 return v ^ (v >> 1);
1721}
1722
1723// poly_marshal_mod3 marshals |in| to |out| where the coefficients of |in| are
1724// all in {0, 1, Q-1, 65535} and |in| is mod Φ(N). (Note that coefficients may
1725// have invalid values when processing attacker-controlled inputs.)
1726static void poly_marshal_mod3(uint8_t out[HRSS_POLY3_BYTES],
1727 const struct poly *in) {
1728 const uint16_t *coeffs = in->v;
1729
1730 // Only 700 coefficients are marshaled because in[700] must be zero.
1731 assert(coeffs[N-1] == 0);
1732
1733 for (size_t i = 0; i < HRSS_POLY3_BYTES; i++) {
1734 const uint16_t coeffs0 = mod3_from_modQ(coeffs[0]);
1735 const uint16_t coeffs1 = mod3_from_modQ(coeffs[1]);
1736 const uint16_t coeffs2 = mod3_from_modQ(coeffs[2]);
1737 const uint16_t coeffs3 = mod3_from_modQ(coeffs[3]);
1738 const uint16_t coeffs4 = mod3_from_modQ(coeffs[4]);
1739 out[i] = coeffs0 + coeffs1 * 3 + coeffs2 * 9 + coeffs3 * 27 + coeffs4 * 81;
1740 coeffs += 5;
1741 }
1742}
1743
1744// HRSS-specific functions
1745// -----------------------
1746
Adam Langley72f01552019-01-21 12:53:43 -08001747// poly_short_sample samples a vector of values in {0xffff (i.e. -1), 0, 1}.
1748// This is the same action as the algorithm in [HRSSNIST] section 1.8.1, but
1749// with HRSS-SXY the sampling algorithm is now a private detail of the
1750// implementation (previously it had to match between two parties). This
1751// function uses that freedom to implement a flatter distribution of values.
Adam Langley7b935932018-11-12 13:53:42 -08001752static void poly_short_sample(struct poly *out,
1753 const uint8_t in[HRSS_SAMPLE_BYTES]) {
David Benjaminb7d63202022-07-26 13:25:02 -07001754 static_assert(HRSS_SAMPLE_BYTES == N - 1, "HRSS_SAMPLE_BYTES incorrect");
Adam Langley72f01552019-01-21 12:53:43 -08001755 for (size_t i = 0; i < N - 1; i++) {
1756 uint16_t v = mod3(in[i]);
1757 // Map {0, 1, 2} -> {0, 1, 0xffff}
1758 v |= ((v >> 1) ^ 1) - 1;
1759 out->v[i] = v;
Adam Langley7b935932018-11-12 13:53:42 -08001760 }
Adam Langley7b935932018-11-12 13:53:42 -08001761 out->v[N - 1] = 0;
Adam Langleyf7e1a942022-04-01 11:46:32 -07001762 poly_normalize(out);
Adam Langley7b935932018-11-12 13:53:42 -08001763}
1764
1765// poly_short_sample_plus performs the T+ sample as defined in [HRSSNIST],
1766// section 1.8.2.
1767static void poly_short_sample_plus(struct poly *out,
1768 const uint8_t in[HRSS_SAMPLE_BYTES]) {
1769 poly_short_sample(out, in);
1770
1771 // sum (and the product in the for loop) will overflow. But that's fine
1772 // because |sum| is bound by +/- (N-2), and N < 2^15 so it works out.
1773 uint16_t sum = 0;
1774 for (unsigned i = 0; i < N - 2; i++) {
1775 sum += (unsigned) out->v[i] * out->v[i + 1];
1776 }
1777
1778 // If the sum is negative, flip the sign of even-positioned coefficients. (See
1779 // page 8 of [HRSS].)
1780 sum = ((int16_t) sum) >> 15;
1781 const uint16_t scale = sum | (~sum & 1);
1782 for (unsigned i = 0; i < N; i += 2) {
1783 out->v[i] = (unsigned) out->v[i] * scale;
1784 }
Adam Langleyf7e1a942022-04-01 11:46:32 -07001785 poly_assert_normalized(out);
Adam Langley7b935932018-11-12 13:53:42 -08001786}
1787
1788// poly_lift computes the function discussed in [HRSS], appendix B.
1789static void poly_lift(struct poly *out, const struct poly *a) {
1790 // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the
1791 // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime).
1792
1793 // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up:
1794 //
1795 // R.<x> = PolynomialRing(GF(3)…)
1796 // inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n))
1797 // list(inv)[:15]
1798 // [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2]
1799 //
1800 // This three-element pattern of coefficients repeats for the whole
1801 // polynomial.
1802 //
1803 // Next define the overbar operator such that z̅ = z[0] +
1804 // reverse(z[1:]). (Index zero of a polynomial here is the coefficient
1805 // of the constant term. So index one is the coefficient of 𝑥 and so
1806 // on.)
1807 //
1808 // A less odd way to define this is to see that z̅ negates the indexes,
1809 // so z̅[0] = z[-0], z̅[1] = z[-1] and so on.
1810 //
1811 // The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = <v,
1812 // z̅>, vz[1] = <v, 𝑥z̅>, …. (Where <a, b> is the inner product: the sum
1813 // of the point-wise products.) Although we calculated the inverse mod
1814 // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end.
1815 // (That's because (𝑥^N - 1) is a multiple of Φ(N).)
1816 //
1817 // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation
1818 // of the list of coefficients.
1819 //
1820 // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like:
1821 //
1822 // def reverse(xs):
1823 // suffix = list(xs[1:])
1824 // suffix.reverse()
1825 // return [xs[0]] + suffix
1826 //
1827 // def rotate(xs):
1828 // return [xs[-1]] + xs[:-1]
1829 //
1830 // zoverbar = reverse(list(inv) + [0])
1831 // xzoverbar = rotate(reverse(list(inv) + [0]))
1832 // x2zoverbar = rotate(rotate(reverse(list(inv) + [0])))
1833 //
1834 // zoverbar[:15]
1835 // [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1]
1836 // xzoverbar[:15]
1837 // [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
1838 // x2zoverbar[:15]
1839 // [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
1840 //
1841 // (For a formula for z̅, see lemma two of appendix B.)
1842 //
1843 // After the first three elements have been taken care of, all then have
1844 // a repeating three-element cycle. The next value (𝑥^3z̅) involves
1845 // three rotations of the first pattern, thus the three-element cycle
1846 // lines up. However, the discontinuity in the first three elements
1847 // obviously moves to a different position. Consider the difference
1848 // between 𝑥^3z̅ and z̅:
1849 //
1850 // [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15]
1851 // [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1852 //
1853 // This pattern of differences is the same for all elements, although it
1854 // obviously moves right with the rotations.
1855 //
1856 // From this, we reach algorithm eight of appendix B.
1857
1858 // Handle the first three elements of the inner products.
1859 out->v[0] = a->v[0] + a->v[2];
1860 out->v[1] = a->v[1];
1861 out->v[2] = -a->v[0] + a->v[2];
1862
1863 // s0, s1, s2 are added into out->v[0], out->v[1], and out->v[2],
1864 // respectively. We do not compute s1 because it's just -(s0 + s1).
1865 uint16_t s0 = 0, s2 = 0;
1866 for (size_t i = 3; i < 699; i += 3) {
1867 s0 += -a->v[i] + a->v[i + 2];
1868 // s1 += a->v[i] - a->v[i + 1];
1869 s2 += a->v[i + 1] - a->v[i + 2];
1870 }
1871
1872 // Handle the fact that the three-element pattern doesn't fill the
1873 // polynomial exactly (since 701 isn't a multiple of three).
1874 s0 -= a->v[699];
1875 // s1 += a->v[699] - a->v[700];
1876 s2 += a->v[700];
1877
1878 // Note that s0 + s1 + s2 = 0.
1879 out->v[0] += s0;
1880 out->v[1] -= (s0 + s2); // = s1
1881 out->v[2] += s2;
1882
1883 // Calculate the remaining inner products by taking advantage of the
1884 // fact that the pattern repeats every three cycles and the pattern of
1885 // differences moves with the rotation.
1886 for (size_t i = 3; i < N; i++) {
1887 out->v[i] = (out->v[i - 3] - (a->v[i - 2] + a->v[i - 1] + a->v[i]));
1888 }
1889
1890 // Reduce mod Φ(N) by subtracting a multiple of out[700] from every
1891 // element and convert to mod Q. (See above about adding twice as
1892 // subtraction.)
1893 const crypto_word_t v = out->v[700];
1894 for (unsigned i = 0; i < N; i++) {
1895 const uint16_t vi_mod3 = mod3(out->v[i] - v);
1896 // Map {0, 1, 2} to {0, 1, 0xffff}.
1897 out->v[i] = (~((vi_mod3 >> 1) - 1)) | vi_mod3;
1898 }
1899
1900 poly_mul_x_minus_1(out);
Adam Langleyf7e1a942022-04-01 11:46:32 -07001901 poly_normalize(out);
Adam Langley7b935932018-11-12 13:53:42 -08001902}
1903
1904struct public_key {
1905 struct poly ph;
1906};
1907
1908struct private_key {
1909 struct poly3 f, f_inverse;
1910 struct poly ph_inverse;
1911 uint8_t hmac_key[32];
1912};
1913
1914// public_key_from_external converts an external public key pointer into an
1915// internal one. Externally the alignment is only specified to be eight bytes
1916// but we need 16-byte alignment. We could annotate the external struct with
1917// that alignment but we can only assume that malloced pointers are 8-byte
1918// aligned in any case. (Even if the underlying malloc returns values with
1919// 16-byte alignment, |OPENSSL_malloc| will store an 8-byte size prefix and mess
1920// that up.)
1921static struct public_key *public_key_from_external(
1922 struct HRSS_public_key *ext) {
David Benjaminb7d63202022-07-26 13:25:02 -07001923 static_assert(
Adam Langley7b935932018-11-12 13:53:42 -08001924 sizeof(struct HRSS_public_key) >= sizeof(struct public_key) + 15,
1925 "HRSS public key too small");
1926
David Benjaminecc301c2021-07-02 12:30:40 -04001927 return align_pointer(ext->opaque, 16);
Adam Langley7b935932018-11-12 13:53:42 -08001928}
1929
1930// private_key_from_external does the same thing as |public_key_from_external|,
1931// but for private keys. See the comment on that function about alignment
1932// issues.
1933static struct private_key *private_key_from_external(
1934 struct HRSS_private_key *ext) {
David Benjaminb7d63202022-07-26 13:25:02 -07001935 static_assert(
Adam Langley7b935932018-11-12 13:53:42 -08001936 sizeof(struct HRSS_private_key) >= sizeof(struct private_key) + 15,
1937 "HRSS private key too small");
1938
David Benjaminecc301c2021-07-02 12:30:40 -04001939 return align_pointer(ext->opaque, 16);
Adam Langley7b935932018-11-12 13:53:42 -08001940}
1941
Adam Langley71530132021-06-30 16:33:47 -07001942// malloc_align32 returns a pointer to |size| bytes of 32-byte-aligned heap and
1943// sets |*out_ptr| to a value that can be passed to |OPENSSL_free| to release
1944// it. It returns NULL if out of memory.
1945static void *malloc_align32(void **out_ptr, size_t size) {
1946 void *ptr = OPENSSL_malloc(size + 31);
1947 if (!ptr) {
1948 *out_ptr = NULL;
1949 return NULL;
1950 }
1951
1952 *out_ptr = ptr;
1953 return align_pointer(ptr, 32);
1954}
1955
1956int HRSS_generate_key(
Adam Langley7b935932018-11-12 13:53:42 -08001957 struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv,
1958 const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32]) {
1959 struct public_key *pub = public_key_from_external(out_pub);
1960 struct private_key *priv = private_key_from_external(out_priv);
1961
Adam Langley71530132021-06-30 16:33:47 -07001962 struct vars {
1963 struct POLY_MUL_SCRATCH scratch;
1964 struct poly f;
1965 struct poly pg_phi1;
1966 struct poly pfg_phi1;
1967 struct poly pfg_phi1_inverse;
1968 };
1969
1970 void *malloc_ptr;
1971 struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars));
1972 if (!vars) {
1973 // If the caller ignores the return value the output will still be safe.
1974 // The private key output is randomised in case it's later passed to
1975 // |HRSS_encap|.
1976 memset(out_pub, 0, sizeof(struct HRSS_public_key));
1977 RAND_bytes((uint8_t*) out_priv, sizeof(struct HRSS_private_key));
1978 return 0;
1979 }
1980
Adam Langleyf7e1a942022-04-01 11:46:32 -07001981#if !defined(NDEBUG)
1982 OPENSSL_memset(vars, 0xff, sizeof(struct vars));
1983#endif
1984
Adam Langley7b935932018-11-12 13:53:42 -08001985 OPENSSL_memcpy(priv->hmac_key, in + 2 * HRSS_SAMPLE_BYTES,
1986 sizeof(priv->hmac_key));
1987
Adam Langley71530132021-06-30 16:33:47 -07001988 poly_short_sample_plus(&vars->f, in);
1989 poly3_from_poly(&priv->f, &vars->f);
Adam Langley7b935932018-11-12 13:53:42 -08001990 HRSS_poly3_invert(&priv->f_inverse, &priv->f);
1991
1992 // pg_phi1 is p (i.e. 3) × g × Φ(1) (i.e. 𝑥-1).
Adam Langley71530132021-06-30 16:33:47 -07001993 poly_short_sample_plus(&vars->pg_phi1, in + HRSS_SAMPLE_BYTES);
Adam Langley7b935932018-11-12 13:53:42 -08001994 for (unsigned i = 0; i < N; i++) {
Adam Langley71530132021-06-30 16:33:47 -07001995 vars->pg_phi1.v[i] *= 3;
Adam Langley7b935932018-11-12 13:53:42 -08001996 }
Adam Langley71530132021-06-30 16:33:47 -07001997 poly_mul_x_minus_1(&vars->pg_phi1);
Adam Langley7b935932018-11-12 13:53:42 -08001998
Adam Langley71530132021-06-30 16:33:47 -07001999 poly_mul(&vars->scratch, &vars->pfg_phi1, &vars->f, &vars->pg_phi1);
Adam Langley7b935932018-11-12 13:53:42 -08002000
Adam Langley71530132021-06-30 16:33:47 -07002001 poly_invert(&vars->scratch, &vars->pfg_phi1_inverse, &vars->pfg_phi1);
Adam Langley7b935932018-11-12 13:53:42 -08002002
Adam Langley71530132021-06-30 16:33:47 -07002003 poly_mul(&vars->scratch, &pub->ph, &vars->pfg_phi1_inverse, &vars->pg_phi1);
2004 poly_mul(&vars->scratch, &pub->ph, &pub->ph, &vars->pg_phi1);
Adam Langley7b935932018-11-12 13:53:42 -08002005 poly_clamp(&pub->ph);
2006
Adam Langley71530132021-06-30 16:33:47 -07002007 poly_mul(&vars->scratch, &priv->ph_inverse, &vars->pfg_phi1_inverse,
2008 &vars->f);
2009 poly_mul(&vars->scratch, &priv->ph_inverse, &priv->ph_inverse, &vars->f);
Adam Langley7b935932018-11-12 13:53:42 -08002010 poly_clamp(&priv->ph_inverse);
Adam Langley71530132021-06-30 16:33:47 -07002011
2012 OPENSSL_free(malloc_ptr);
2013 return 1;
Adam Langley7b935932018-11-12 13:53:42 -08002014}
2015
Adam Langley7b935932018-11-12 13:53:42 -08002016static const char kSharedKey[] = "shared key";
2017
Adam Langley71530132021-06-30 16:33:47 -07002018int HRSS_encap(uint8_t out_ciphertext[POLY_BYTES], uint8_t out_shared_key[32],
2019 const struct HRSS_public_key *in_pub,
2020 const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES]) {
Adam Langley7b935932018-11-12 13:53:42 -08002021 const struct public_key *pub =
2022 public_key_from_external((struct HRSS_public_key *)in_pub);
Adam Langley9700b442018-12-17 11:17:26 -08002023
Adam Langley71530132021-06-30 16:33:47 -07002024 struct vars {
2025 struct POLY_MUL_SCRATCH scratch;
2026 struct poly m, r, m_lifted;
2027 struct poly prh_plus_m;
2028 SHA256_CTX hash_ctx;
2029 uint8_t m_bytes[HRSS_POLY3_BYTES];
2030 uint8_t r_bytes[HRSS_POLY3_BYTES];
2031 };
2032
2033 void *malloc_ptr;
2034 struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars));
2035 if (!vars) {
2036 // If the caller ignores the return value the output will still be safe.
2037 // The private key output is randomised in case it's used to encrypt and
2038 // transmit something.
2039 memset(out_ciphertext, 0, POLY_BYTES);
2040 RAND_bytes(out_shared_key, 32);
2041 return 0;
Adam Langley9700b442018-12-17 11:17:26 -08002042 }
2043
Adam Langleyf7e1a942022-04-01 11:46:32 -07002044#if !defined(NDEBUG)
2045 OPENSSL_memset(vars, 0xff, sizeof(struct vars));
2046#endif
2047
Adam Langley71530132021-06-30 16:33:47 -07002048 poly_short_sample(&vars->m, in);
2049 poly_short_sample(&vars->r, in + HRSS_SAMPLE_BYTES);
2050 poly_lift(&vars->m_lifted, &vars->m);
Adam Langley7b935932018-11-12 13:53:42 -08002051
Adam Langley71530132021-06-30 16:33:47 -07002052 poly_mul(&vars->scratch, &vars->prh_plus_m, &vars->r, &pub->ph);
2053 for (unsigned i = 0; i < N; i++) {
2054 vars->prh_plus_m.v[i] += vars->m_lifted.v[i];
2055 }
Adam Langley7b935932018-11-12 13:53:42 -08002056
Adam Langley71530132021-06-30 16:33:47 -07002057 poly_marshal(out_ciphertext, &vars->prh_plus_m);
2058
2059 poly_marshal_mod3(vars->m_bytes, &vars->m);
2060 poly_marshal_mod3(vars->r_bytes, &vars->r);
2061
2062 SHA256_Init(&vars->hash_ctx);
2063 SHA256_Update(&vars->hash_ctx, kSharedKey, sizeof(kSharedKey));
2064 SHA256_Update(&vars->hash_ctx, vars->m_bytes, sizeof(vars->m_bytes));
2065 SHA256_Update(&vars->hash_ctx, vars->r_bytes, sizeof(vars->r_bytes));
2066 SHA256_Update(&vars->hash_ctx, out_ciphertext, POLY_BYTES);
2067 SHA256_Final(out_shared_key, &vars->hash_ctx);
2068
2069 OPENSSL_free(malloc_ptr);
2070 return 1;
Adam Langley7b935932018-11-12 13:53:42 -08002071}
2072
Adam Langley71530132021-06-30 16:33:47 -07002073int HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES],
Adam Langley7b935932018-11-12 13:53:42 -08002074 const struct HRSS_private_key *in_priv,
2075 const uint8_t *ciphertext, size_t ciphertext_len) {
Adam Langley7b935932018-11-12 13:53:42 -08002076 const struct private_key *priv =
2077 private_key_from_external((struct HRSS_private_key *)in_priv);
2078
Adam Langley71530132021-06-30 16:33:47 -07002079 struct vars {
2080 struct POLY_MUL_SCRATCH scratch;
2081 uint8_t masked_key[SHA256_CBLOCK];
2082 SHA256_CTX hash_ctx;
2083 struct poly c;
2084 struct poly f, cf;
2085 struct poly3 cf3, m3;
2086 struct poly m, m_lifted;
2087 struct poly r;
2088 struct poly3 r3;
2089 uint8_t expected_ciphertext[HRSS_CIPHERTEXT_BYTES];
2090 uint8_t m_bytes[HRSS_POLY3_BYTES];
2091 uint8_t r_bytes[HRSS_POLY3_BYTES];
2092 uint8_t shared_key[32];
2093 };
2094
2095 void *malloc_ptr;
2096 struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars));
2097 if (!vars) {
2098 // If the caller ignores the return value the output will still be safe.
2099 // The private key output is randomised in case it's used to encrypt and
2100 // transmit something.
2101 RAND_bytes(out_shared_key, HRSS_KEY_BYTES);
2102 return 0;
2103 }
2104
Adam Langleyf7e1a942022-04-01 11:46:32 -07002105#if !defined(NDEBUG)
2106 OPENSSL_memset(vars, 0xff, sizeof(struct vars));
2107#endif
2108
Adam Langley7b935932018-11-12 13:53:42 -08002109 // This is HMAC, expanded inline rather than using the |HMAC| function so that
2110 // we can avoid dealing with possible allocation failures and so keep this
2111 // function infallible.
David Benjaminb7d63202022-07-26 13:25:02 -07002112 static_assert(sizeof(priv->hmac_key) <= sizeof(vars->masked_key),
2113 "HRSS HMAC key larger than SHA-256 block size");
Adam Langley7b935932018-11-12 13:53:42 -08002114 for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
Adam Langley71530132021-06-30 16:33:47 -07002115 vars->masked_key[i] = priv->hmac_key[i] ^ 0x36;
Adam Langley7b935932018-11-12 13:53:42 -08002116 }
Adam Langley71530132021-06-30 16:33:47 -07002117 OPENSSL_memset(vars->masked_key + sizeof(priv->hmac_key), 0x36,
2118 sizeof(vars->masked_key) - sizeof(priv->hmac_key));
Adam Langley7b935932018-11-12 13:53:42 -08002119
Adam Langley71530132021-06-30 16:33:47 -07002120 SHA256_Init(&vars->hash_ctx);
2121 SHA256_Update(&vars->hash_ctx, vars->masked_key, sizeof(vars->masked_key));
2122 SHA256_Update(&vars->hash_ctx, ciphertext, ciphertext_len);
Adam Langley7b935932018-11-12 13:53:42 -08002123 uint8_t inner_digest[SHA256_DIGEST_LENGTH];
Adam Langley71530132021-06-30 16:33:47 -07002124 SHA256_Final(inner_digest, &vars->hash_ctx);
Adam Langley7b935932018-11-12 13:53:42 -08002125
2126 for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
Adam Langley71530132021-06-30 16:33:47 -07002127 vars->masked_key[i] ^= (0x5c ^ 0x36);
Adam Langley7b935932018-11-12 13:53:42 -08002128 }
Adam Langley71530132021-06-30 16:33:47 -07002129 OPENSSL_memset(vars->masked_key + sizeof(priv->hmac_key), 0x5c,
2130 sizeof(vars->masked_key) - sizeof(priv->hmac_key));
Adam Langley7b935932018-11-12 13:53:42 -08002131
Adam Langley71530132021-06-30 16:33:47 -07002132 SHA256_Init(&vars->hash_ctx);
2133 SHA256_Update(&vars->hash_ctx, vars->masked_key, sizeof(vars->masked_key));
2134 SHA256_Update(&vars->hash_ctx, inner_digest, sizeof(inner_digest));
David Benjaminb7d63202022-07-26 13:25:02 -07002135 static_assert(HRSS_KEY_BYTES == SHA256_DIGEST_LENGTH,
2136 "HRSS shared key length incorrect");
Adam Langley71530132021-06-30 16:33:47 -07002137 SHA256_Final(out_shared_key, &vars->hash_ctx);
Adam Langley7b935932018-11-12 13:53:42 -08002138
2139 // If the ciphertext is publicly invalid then a random shared key is still
2140 // returned to simply the logic of the caller, but this path is not constant
2141 // time.
Adam Langleyf8068ce2018-12-17 11:16:23 -08002142 if (ciphertext_len != HRSS_CIPHERTEXT_BYTES ||
Adam Langley71530132021-06-30 16:33:47 -07002143 !poly_unmarshal(&vars->c, ciphertext)) {
2144 goto out;
Adam Langley7b935932018-11-12 13:53:42 -08002145 }
2146
Adam Langley71530132021-06-30 16:33:47 -07002147 poly_from_poly3(&vars->f, &priv->f);
2148 poly_mul(&vars->scratch, &vars->cf, &vars->c, &vars->f);
2149 poly3_from_poly(&vars->cf3, &vars->cf);
Adam Langley7b935932018-11-12 13:53:42 -08002150 // Note that cf3 is not reduced mod Φ(N). That reduction is deferred.
Adam Langley71530132021-06-30 16:33:47 -07002151 HRSS_poly3_mul(&vars->m3, &vars->cf3, &priv->f_inverse);
Adam Langley7b935932018-11-12 13:53:42 -08002152
Adam Langley71530132021-06-30 16:33:47 -07002153 poly_from_poly3(&vars->m, &vars->m3);
2154 poly_lift(&vars->m_lifted, &vars->m);
Adam Langley7b935932018-11-12 13:53:42 -08002155
2156 for (unsigned i = 0; i < N; i++) {
Adam Langley71530132021-06-30 16:33:47 -07002157 vars->r.v[i] = vars->c.v[i] - vars->m_lifted.v[i];
Adam Langley7b935932018-11-12 13:53:42 -08002158 }
Adam Langleyf7e1a942022-04-01 11:46:32 -07002159 poly_normalize(&vars->r);
Adam Langley71530132021-06-30 16:33:47 -07002160 poly_mul(&vars->scratch, &vars->r, &vars->r, &priv->ph_inverse);
2161 poly_mod_phiN(&vars->r);
2162 poly_clamp(&vars->r);
Adam Langley7b935932018-11-12 13:53:42 -08002163
Adam Langley71530132021-06-30 16:33:47 -07002164 crypto_word_t ok = poly3_from_poly_checked(&vars->r3, &vars->r);
Adam Langley9700b442018-12-17 11:17:26 -08002165
2166 // [NTRUCOMP] section 5.1 includes ReEnc2 and a proof that it's valid. Rather
2167 // than do an expensive |poly_mul|, it rebuilds |c'| from |c - lift(m)|
2168 // (called |b|) with:
2169 // t = (−b(1)/N) mod Q
2170 // c' = b + tΦ(N) + lift(m) mod Q
2171 //
2172 // When polynomials are transmitted, the final coefficient is omitted and
2173 // |poly_unmarshal| sets it such that f(1) == 0. Thus c(1) == 0. Also,
2174 // |poly_lift| multiplies the result by (x-1) and therefore evaluating a
2175 // lifted polynomial at 1 is also zero. Thus lift(m)(1) == 0 and so
2176 // (c - lift(m))(1) == 0.
2177 //
2178 // Although we defer the reduction above, |b| is conceptually reduced mod
2179 // Φ(N). In order to do that reduction one subtracts |c[N-1]| from every
2180 // coefficient. Therefore b(1) = -c[N-1]×N. The value of |t|, above, then is
2181 // just recovering |c[N-1]|, and adding tΦ(N) is simply undoing the reduction.
2182 // Therefore b + tΦ(N) + lift(m) = c by construction and we don't need to
2183 // recover |c| at all so long as we do the checks in
2184 // |poly3_from_poly_checked|.
2185 //
2186 // The |poly_marshal| here then is just confirming that |poly_unmarshal| is
2187 // strict and could be omitted.
Adam Langley7b935932018-11-12 13:53:42 -08002188
David Benjaminb7d63202022-07-26 13:25:02 -07002189 static_assert(HRSS_CIPHERTEXT_BYTES == POLY_BYTES,
2190 "ciphertext is the wrong size");
Adam Langley71530132021-06-30 16:33:47 -07002191 assert(ciphertext_len == sizeof(vars->expected_ciphertext));
2192 poly_marshal(vars->expected_ciphertext, &vars->c);
Adam Langley7b935932018-11-12 13:53:42 -08002193
Adam Langley71530132021-06-30 16:33:47 -07002194 poly_marshal_mod3(vars->m_bytes, &vars->m);
2195 poly_marshal_mod3(vars->r_bytes, &vars->r);
Adam Langley7b935932018-11-12 13:53:42 -08002196
Adam Langley71530132021-06-30 16:33:47 -07002197 ok &= constant_time_is_zero_w(
2198 CRYPTO_memcmp(ciphertext, vars->expected_ciphertext,
2199 sizeof(vars->expected_ciphertext)));
Adam Langley7b935932018-11-12 13:53:42 -08002200
Adam Langley71530132021-06-30 16:33:47 -07002201 SHA256_Init(&vars->hash_ctx);
2202 SHA256_Update(&vars->hash_ctx, kSharedKey, sizeof(kSharedKey));
2203 SHA256_Update(&vars->hash_ctx, vars->m_bytes, sizeof(vars->m_bytes));
2204 SHA256_Update(&vars->hash_ctx, vars->r_bytes, sizeof(vars->r_bytes));
2205 SHA256_Update(&vars->hash_ctx, vars->expected_ciphertext,
2206 sizeof(vars->expected_ciphertext));
2207 SHA256_Final(vars->shared_key, &vars->hash_ctx);
Adam Langley7b935932018-11-12 13:53:42 -08002208
Adam Langley71530132021-06-30 16:33:47 -07002209 for (unsigned i = 0; i < sizeof(vars->shared_key); i++) {
Adam Langley7b935932018-11-12 13:53:42 -08002210 out_shared_key[i] =
Adam Langley71530132021-06-30 16:33:47 -07002211 constant_time_select_8(ok, vars->shared_key[i], out_shared_key[i]);
Adam Langley7b935932018-11-12 13:53:42 -08002212 }
Adam Langley71530132021-06-30 16:33:47 -07002213
2214out:
2215 OPENSSL_free(malloc_ptr);
2216 return 1;
Adam Langley7b935932018-11-12 13:53:42 -08002217}
2218
2219void HRSS_marshal_public_key(uint8_t out[HRSS_PUBLIC_KEY_BYTES],
2220 const struct HRSS_public_key *in_pub) {
2221 const struct public_key *pub =
2222 public_key_from_external((struct HRSS_public_key *)in_pub);
2223 poly_marshal(out, &pub->ph);
2224}
2225
2226int HRSS_parse_public_key(struct HRSS_public_key *out,
2227 const uint8_t in[HRSS_PUBLIC_KEY_BYTES]) {
2228 struct public_key *pub = public_key_from_external(out);
Adam Langleyf8068ce2018-12-17 11:16:23 -08002229 if (!poly_unmarshal(&pub->ph, in)) {
2230 return 0;
2231 }
Adam Langley7b935932018-11-12 13:53:42 -08002232 OPENSSL_memset(&pub->ph.v[N], 0, 3 * sizeof(uint16_t));
2233 return 1;
2234}