Blame - crypto/hrss/hrss.c - boringssl.git

blob: 572e9817c923a6738aec3c28f1cc424cf224df23 [file] [log] [blame]

Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1	/* Copyright (c) 2018, Google Inc.
				2	*
				3	* Permission to use, copy, modify, and/or distribute this software for any
				4	* purpose with or without fee is hereby granted, provided that the above
				5	* copyright notice and this permission notice appear in all copies.
				6	*
				7	* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
				8	* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
				9	* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
				10	* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				11	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
				12	* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
				13	* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
				14
				15	#include <openssl/hrss.h>
				16
				17	#include <assert.h>
				18	#include <stdio.h>
				19	#include <stdlib.h>
				20
				21	#include <openssl/bn.h>
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	22	#include <openssl/hmac.h>
				23	#include <openssl/mem.h>
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	24	#include <openssl/rand.h>
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	25	#include <openssl/sha.h>
				26
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	27	#if defined(_MSC_VER)
				28	#define RESTRICT
				29	#else
				30	#define RESTRICT restrict
				31	#endif
				32
				33	#include "../internal.h"
				34	#include "internal.h"
				35
David Benjamin	6887d5e	2019-12-17 13:56:30 -0500	[diff] [blame]	36	#if defined(OPENSSL_SSE2)
				37	#include <emmintrin.h>
				38	#endif
				39
David Benjamin	846a227	2022-01-06 11:12:06 -0500	[diff] [blame]	40	#if (defined(OPENSSL_ARM) \|\| defined(OPENSSL_AARCH64)) && defined(__ARM_NEON)
David Benjamin	6887d5e	2019-12-17 13:56:30 -0500	[diff] [blame]	41	#include <arm_neon.h>
				42	#endif
				43
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	44	// This is an implementation of [HRSS], but with a KEM transformation based on
				45	// [SXY]. The primary references are:
				46
				47	// HRSS: https://eprint.iacr.org/2017/667.pdf
				48	// HRSSNIST:
				49	// https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/NTRU_HRSS_KEM.zip
				50	// SXY: https://eprint.iacr.org/2017/1005.pdf
				51	// NTRUTN14:
				52	// https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	53	// NTRUCOMP: https://eprint.iacr.org/2018/1174
				54	// SAFEGCD: https://gcd.cr.yp.to/papers.html#safegcd
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	55
				56
				57	// Vector operations.
				58	//
				59	// A couple of functions in this file can use vector operations to meaningful
				60	// effect. If we're building for a target that has a supported vector unit,
				61	// \|HRSS_HAVE_VECTOR_UNIT\| will be defined and \|vec_t\| will be typedefed to a
				62	// 128-bit vector. The following functions abstract over the differences between
				63	// NEON and SSE2 for implementing some vector operations.
				64
David Benjamin	6887d5e	2019-12-17 13:56:30 -0500	[diff] [blame]	65	// TODO: MSVC can likely also be made to work with vector operations, but ^ must
				66	// be replaced with _mm_xor_si128, etc.
				67	#if defined(OPENSSL_SSE2) && (defined(__clang__) \|\| !defined(_MSC_VER))
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	68
				69	#define HRSS_HAVE_VECTOR_UNIT
				70	typedef __m128i vec_t;
				71
				72	// vec_capable returns one iff the current platform supports SSE2.
David Benjamin	6887d5e	2019-12-17 13:56:30 -0500	[diff] [blame]	73	static int vec_capable(void) { return 1; }
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	74
				75	// vec_add performs a pair-wise addition of four uint16s from \|a\| and \|b\|.
				76	static inline vec_t vec_add(vec_t a, vec_t b) { return _mm_add_epi16(a, b); }
				77
				78	// vec_sub performs a pair-wise subtraction of four uint16s from \|a\| and \|b\|.
				79	static inline vec_t vec_sub(vec_t a, vec_t b) { return _mm_sub_epi16(a, b); }
				80
				81	// vec_mul multiplies each uint16_t in \|a\| by \|b\| and returns the resulting
				82	// vector.
				83	static inline vec_t vec_mul(vec_t a, uint16_t b) {
				84	return _mm_mullo_epi16(a, _mm_set1_epi16(b));
				85	}
				86
				87	// vec_fma multiplies each uint16_t in \|b\| by \|c\|, adds the result to \|a\|, and
				88	// returns the resulting vector.
				89	static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
				90	return _mm_add_epi16(a, _mm_mullo_epi16(b, _mm_set1_epi16(c)));
				91	}
				92
				93	// vec3_rshift_word right-shifts the 24 uint16_t's in \|v\| by one uint16.
				94	static inline void vec3_rshift_word(vec_t v[3]) {
				95	// Intel's left and right shifting is backwards compared to the order in
				96	// memory because they're based on little-endian order of words (and not just
				97	// bytes). So the shifts in this function will be backwards from what one
				98	// might expect.
				99	const __m128i carry0 = _mm_srli_si128(v[0], 14);
				100	v[0] = _mm_slli_si128(v[0], 2);
				101
				102	const __m128i carry1 = _mm_srli_si128(v[1], 14);
				103	v[1] = _mm_slli_si128(v[1], 2);
				104	v[1] \|= carry0;
				105
				106	v[2] = _mm_slli_si128(v[2], 2);
				107	v[2] \|= carry1;
				108	}
				109
				110	// vec4_rshift_word right-shifts the 32 uint16_t's in \|v\| by one uint16.
				111	static inline void vec4_rshift_word(vec_t v[4]) {
				112	// Intel's left and right shifting is backwards compared to the order in
				113	// memory because they're based on little-endian order of words (and not just
				114	// bytes). So the shifts in this function will be backwards from what one
				115	// might expect.
				116	const __m128i carry0 = _mm_srli_si128(v[0], 14);
				117	v[0] = _mm_slli_si128(v[0], 2);
				118
				119	const __m128i carry1 = _mm_srli_si128(v[1], 14);
				120	v[1] = _mm_slli_si128(v[1], 2);
				121	v[1] \|= carry0;
				122
				123	const __m128i carry2 = _mm_srli_si128(v[2], 14);
				124	v[2] = _mm_slli_si128(v[2], 2);
				125	v[2] \|= carry1;
				126
				127	v[3] = _mm_slli_si128(v[3], 2);
				128	v[3] \|= carry2;
				129	}
				130
				131	// vec_merge_3_5 takes the final three uint16_t's from \|left\|, appends the first
				132	// five from \|right\|, and returns the resulting vector.
				133	static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
				134	return _mm_srli_si128(left, 10) \| _mm_slli_si128(right, 6);
				135	}
				136
				137	// poly3_vec_lshift1 left-shifts the 768 bits in \|a_s\|, and in \|a_a\|, by one
				138	// bit.
				139	static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
				140	vec_t carry_s = {0};
				141	vec_t carry_a = {0};
				142
				143	for (int i = 0; i < 6; i++) {
				144	vec_t next_carry_s = _mm_srli_epi64(a_s[i], 63);
				145	a_s[i] = _mm_slli_epi64(a_s[i], 1);
				146	a_s[i] \|= _mm_slli_si128(next_carry_s, 8);
				147	a_s[i] \|= carry_s;
				148	carry_s = _mm_srli_si128(next_carry_s, 8);
				149
				150	vec_t next_carry_a = _mm_srli_epi64(a_a[i], 63);
				151	a_a[i] = _mm_slli_epi64(a_a[i], 1);
				152	a_a[i] \|= _mm_slli_si128(next_carry_a, 8);
				153	a_a[i] \|= carry_a;
				154	carry_a = _mm_srli_si128(next_carry_a, 8);
				155	}
				156	}
				157
				158	// poly3_vec_rshift1 right-shifts the 768 bits in \|a_s\|, and in \|a_a\|, by one
				159	// bit.
				160	static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
				161	vec_t carry_s = {0};
				162	vec_t carry_a = {0};
				163
				164	for (int i = 5; i >= 0; i--) {
				165	const vec_t next_carry_s = _mm_slli_epi64(a_s[i], 63);
				166	a_s[i] = _mm_srli_epi64(a_s[i], 1);
				167	a_s[i] \|= _mm_srli_si128(next_carry_s, 8);
				168	a_s[i] \|= carry_s;
				169	carry_s = _mm_slli_si128(next_carry_s, 8);
				170
				171	const vec_t next_carry_a = _mm_slli_epi64(a_a[i], 63);
				172	a_a[i] = _mm_srli_epi64(a_a[i], 1);
				173	a_a[i] \|= _mm_srli_si128(next_carry_a, 8);
				174	a_a[i] \|= carry_a;
				175	carry_a = _mm_slli_si128(next_carry_a, 8);
				176	}
				177	}
				178
				179	// vec_broadcast_bit duplicates the least-significant bit in \|a\| to all bits in
				180	// a vector and returns the result.
				181	static inline vec_t vec_broadcast_bit(vec_t a) {
				182	return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63), 31),
				183	0b01010101);
				184	}
				185
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	186	// vec_get_word returns the \|i\|th uint16_t in \|v\|. (This is a macro because the
				187	// compiler requires that \|i\| be a compile-time constant.)
				188	#define vec_get_word(v, i) _mm_extract_epi16(v, i)
				189
David Benjamin	846a227	2022-01-06 11:12:06 -0500	[diff] [blame]	190	#elif (defined(OPENSSL_ARM) \|\| defined(OPENSSL_AARCH64)) && defined(__ARM_NEON)
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	191
				192	#define HRSS_HAVE_VECTOR_UNIT
				193	typedef uint16x8_t vec_t;
				194
				195	// These functions perform the same actions as the SSE2 function of the same
				196	// name, above.
				197
Alessandro Ghedini	9b0970f	2018-12-14 16:43:36 +0000	[diff] [blame]	198	static int vec_capable(void) { return CRYPTO_is_NEON_capable(); }
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	199
				200	static inline vec_t vec_add(vec_t a, vec_t b) { return a + b; }
				201
				202	static inline vec_t vec_sub(vec_t a, vec_t b) { return a - b; }
				203
				204	static inline vec_t vec_mul(vec_t a, uint16_t b) { return vmulq_n_u16(a, b); }
				205
				206	static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
				207	return vmlaq_n_u16(a, b, c);
				208	}
				209
				210	static inline void vec3_rshift_word(vec_t v[3]) {
				211	const uint16x8_t kZero = {0};
				212	v[2] = vextq_u16(v[1], v[2], 7);
				213	v[1] = vextq_u16(v[0], v[1], 7);
				214	v[0] = vextq_u16(kZero, v[0], 7);
				215	}
				216
				217	static inline void vec4_rshift_word(vec_t v[4]) {
				218	const uint16x8_t kZero = {0};
				219	v[3] = vextq_u16(v[2], v[3], 7);
				220	v[2] = vextq_u16(v[1], v[2], 7);
				221	v[1] = vextq_u16(v[0], v[1], 7);
				222	v[0] = vextq_u16(kZero, v[0], 7);
				223	}
				224
				225	static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
				226	return vextq_u16(left, right, 5);
				227	}
				228
				229	static inline uint16_t vec_get_word(vec_t v, unsigned i) {
				230	return v[i];
				231	}
				232
				233	#if !defined(OPENSSL_AARCH64)
				234
				235	static inline vec_t vec_broadcast_bit(vec_t a) {
				236	a = (vec_t)vshrq_n_s16(((int16x8_t)a) << 15, 15);
				237	return vdupq_lane_u16(vget_low_u16(a), 0);
				238	}
				239
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	240	static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
				241	vec_t carry_s = {0};
				242	vec_t carry_a = {0};
				243	const vec_t kZero = {0};
				244
				245	for (int i = 0; i < 6; i++) {
				246	vec_t next_carry_s = a_s[i] >> 15;
				247	a_s[i] <<= 1;
				248	a_s[i] \|= vextq_u16(kZero, next_carry_s, 7);
				249	a_s[i] \|= carry_s;
				250	carry_s = vextq_u16(next_carry_s, kZero, 7);
				251
				252	vec_t next_carry_a = a_a[i] >> 15;
				253	a_a[i] <<= 1;
				254	a_a[i] \|= vextq_u16(kZero, next_carry_a, 7);
				255	a_a[i] \|= carry_a;
				256	carry_a = vextq_u16(next_carry_a, kZero, 7);
				257	}
				258	}
				259
				260	static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
				261	vec_t carry_s = {0};
				262	vec_t carry_a = {0};
				263	const vec_t kZero = {0};
				264
				265	for (int i = 5; i >= 0; i--) {
				266	vec_t next_carry_s = a_s[i] << 15;
				267	a_s[i] >>= 1;
				268	a_s[i] \|= vextq_u16(next_carry_s, kZero, 1);
				269	a_s[i] \|= carry_s;
				270	carry_s = vextq_u16(kZero, next_carry_s, 1);
				271
				272	vec_t next_carry_a = a_a[i] << 15;
				273	a_a[i] >>= 1;
				274	a_a[i] \|= vextq_u16(next_carry_a, kZero, 1);
				275	a_a[i] \|= carry_a;
				276	carry_a = vextq_u16(kZero, next_carry_a, 1);
				277	}
				278	}
				279
				280	#endif // !OPENSSL_AARCH64
				281
				282	#endif // (ARM \|\| AARCH64) && NEON
				283
				284	// Polynomials in this scheme have N terms.
				285	// #define N 701
				286
				287	// Underlying data types and arithmetic operations.
				288	// ------------------------------------------------
				289
				290	// Binary polynomials.
				291
				292	// poly2 represents a degree-N polynomial over GF(2). The words are in little-
				293	// endian order, i.e. the coefficient of x^0 is the LSB of the first word. The
				294	// final word is only partially used since N is not a multiple of the word size.
				295
				296	// Defined in internal.h:
				297	// struct poly2 {
				298	// crypto_word_t v[WORDS_PER_POLY];
				299	// };
				300
				301	OPENSSL_UNUSED static void hexdump(const void *void_in, size_t len) {
				302	const uint8_t in = (const uint8_t )void_in;
				303	for (size_t i = 0; i < len; i++) {
				304	printf("%02x", in[i]);
				305	}
				306	printf("\n");
				307	}
				308
				309	static void poly2_zero(struct poly2 *p) {
				310	OPENSSL_memset(&p->v[0], 0, sizeof(crypto_word_t) * WORDS_PER_POLY);
				311	}
				312
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	313	// word_reverse returns \|in\| with the bits in reverse order.
				314	static crypto_word_t word_reverse(crypto_word_t in) {
				315	#if defined(OPENSSL_64_BIT)
				316	static const crypto_word_t kMasks[6] = {
				317	UINT64_C(0x5555555555555555),
				318	UINT64_C(0x3333333333333333),
				319	UINT64_C(0x0f0f0f0f0f0f0f0f),
				320	UINT64_C(0x00ff00ff00ff00ff),
				321	UINT64_C(0x0000ffff0000ffff),
				322	UINT64_C(0x00000000ffffffff),
				323	};
				324	#else
				325	static const crypto_word_t kMasks[5] = {
				326	0x55555555,
				327	0x33333333,
				328	0x0f0f0f0f,
				329	0x00ff00ff,
				330	0x0000ffff,
				331	};
				332	#endif
				333
				334	for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kMasks); i++) {
				335	in = ((in >> (1 << i)) & kMasks[i]) \| ((in & kMasks[i]) << (1 << i));
				336	}
				337
				338	return in;
				339	}
				340
				341	// lsb_to_all replicates the least-significant bit of \|v\| to all bits of the
				342	// word. This is used in bit-slicing operations to make a vector from a fixed
				343	// value.
				344	static crypto_word_t lsb_to_all(crypto_word_t v) { return 0u - (v & 1); }
				345
				346	// poly2_mod_phiN reduces \|p\| by Φ(N).
				347	static void poly2_mod_phiN(struct poly2 *p) {
				348	// m is the term at x^700, replicated to every bit.
				349	const crypto_word_t m =
				350	lsb_to_all(p->v[WORDS_PER_POLY - 1] >> (BITS_IN_LAST_WORD - 1));
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	351	for (size_t i = 0; i < WORDS_PER_POLY; i++) {
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	352	p->v[i] ^= m;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	353	}
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	354	p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << (BITS_IN_LAST_WORD - 1)) - 1;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	355	}
				356
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	357	// poly2_reverse_700 reverses the order of the first 700 bits of \|in\| and writes
				358	// the result to \|out\|.
				359	static void poly2_reverse_700(struct poly2 out, const struct poly2 in) {
				360	struct poly2 t;
				361	for (size_t i = 0; i < WORDS_PER_POLY; i++) {
				362	t.v[i] = word_reverse(in->v[i]);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	363	}
				364
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	365	static const size_t shift = BITS_PER_WORD - ((N-1) % BITS_PER_WORD);
				366	for (size_t i = 0; i < WORDS_PER_POLY-1; i++) {
				367	out->v[i] = t.v[WORDS_PER_POLY-1-i] >> shift;
				368	out->v[i] \|= t.v[WORDS_PER_POLY-2-i] << (BITS_PER_WORD - shift);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	369	}
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	370	out->v[WORDS_PER_POLY-1] = t.v[0] >> shift;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	371	}
				372
				373	// poly2_cswap exchanges the values of \|a\| and \|b\| if \|swap\| is all ones.
				374	static void poly2_cswap(struct poly2 a, struct poly2 b, crypto_word_t swap) {
				375	for (size_t i = 0; i < WORDS_PER_POLY; i++) {
				376	const crypto_word_t sum = swap & (a->v[i] ^ b->v[i]);
				377	a->v[i] ^= sum;
				378	b->v[i] ^= sum;
				379	}
				380	}
				381
				382	// poly2_fmadd sets \|out\| to \|out\| + \|in\| * m, where m is either
				383	// \|CONSTTIME_TRUE_W\| or \|CONSTTIME_FALSE_W\|.
				384	static void poly2_fmadd(struct poly2 out, const struct poly2 in,
				385	crypto_word_t m) {
				386	for (size_t i = 0; i < WORDS_PER_POLY; i++) {
				387	out->v[i] ^= in->v[i] & m;
				388	}
				389	}
				390
				391	// poly2_lshift1 left-shifts \|p\| by one bit.
				392	static void poly2_lshift1(struct poly2 *p) {
				393	crypto_word_t carry = 0;
				394	for (size_t i = 0; i < WORDS_PER_POLY; i++) {
				395	const crypto_word_t next_carry = p->v[i] >> (BITS_PER_WORD - 1);
				396	p->v[i] <<= 1;
				397	p->v[i] \|= carry;
				398	carry = next_carry;
				399	}
				400	}
				401
				402	// poly2_rshift1 right-shifts \|p\| by one bit.
				403	static void poly2_rshift1(struct poly2 *p) {
				404	crypto_word_t carry = 0;
				405	for (size_t i = WORDS_PER_POLY - 1; i < WORDS_PER_POLY; i--) {
				406	const crypto_word_t next_carry = p->v[i] & 1;
				407	p->v[i] >>= 1;
				408	p->v[i] \|= carry << (BITS_PER_WORD - 1);
				409	carry = next_carry;
				410	}
				411	}
				412
				413	// poly2_clear_top_bits clears the bits in the final word that are only for
				414	// alignment.
				415	static void poly2_clear_top_bits(struct poly2 *p) {
				416	p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
				417	}
				418
				419	// poly2_top_bits_are_clear returns one iff the extra bits in the final words of
				420	// \|p\| are zero.
				421	static int poly2_top_bits_are_clear(const struct poly2 *p) {
				422	return (p->v[WORDS_PER_POLY - 1] &
				423	~((UINT64_C(1) << BITS_IN_LAST_WORD) - 1)) == 0;
				424	}
				425
				426	// Ternary polynomials.
				427
				428	// poly3 represents a degree-N polynomial over GF(3). Each coefficient is
				429	// bitsliced across the \|s\| and \|a\| arrays, like this:
				430	//
				431	// s \| a \| value
				432	// -----------------
				433	// 0 \| 0 \| 0
				434	// 0 \| 1 \| 1
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	435	// 1 \| 1 \| -1 (aka 2)
				436	// 1 \| 0 \| <invalid>
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	437	//
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	438	// ('s' is for sign, and 'a' is the absolute value.)
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	439	//
				440	// Once bitsliced as such, the following circuits can be used to implement
				441	// addition and multiplication mod 3:
				442	//
				443	// (s3, a3) = (s1, a1) × (s2, a2)
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	444	// a3 = a1 ∧ a2
				445	// s3 = (s1 ⊕ s2) ∧ a3
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	446	//
				447	// (s3, a3) = (s1, a1) + (s2, a2)
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	448	// t = s1 ⊕ a2
				449	// s3 = t ∧ (s2 ⊕ a1)
				450	// a3 = (a1 ⊕ a2) ∨ (t ⊕ s2)
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	451	//
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	452	// (s3, a3) = (s1, a1) - (s2, a2)
				453	// t = a1 ⊕ a2
				454	// s3 = (s1 ⊕ a2) ∧ (t ⊕ s2)
				455	// a3 = t ∨ (s1 ⊕ s2)
				456	//
				457	// Negating a value just involves XORing s by a.
				458	//
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	459	// struct poly3 {
				460	// struct poly2 s, a;
				461	// };
				462
				463	OPENSSL_UNUSED static void poly3_print(const struct poly3 *in) {
				464	struct poly3 p;
				465	OPENSSL_memcpy(&p, in, sizeof(p));
				466	p.s.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1;
				467	p.a.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1;
				468
				469	printf("{[");
				470	for (unsigned i = 0; i < WORDS_PER_POLY; i++) {
				471	if (i) {
				472	printf(" ");
				473	}
				474	printf(BN_HEX_FMT2, p.s.v[i]);
				475	}
				476	printf("] [");
				477	for (unsigned i = 0; i < WORDS_PER_POLY; i++) {
				478	if (i) {
				479	printf(" ");
				480	}
				481	printf(BN_HEX_FMT2, p.a.v[i]);
				482	}
				483	printf("]}\n");
				484	}
				485
				486	static void poly3_zero(struct poly3 *p) {
				487	poly2_zero(&p->s);
				488	poly2_zero(&p->a);
				489	}
				490
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	491	// poly3_reverse_700 reverses the order of the first 700 terms of \|in\| and
				492	// writes them to \|out\|.
				493	static void poly3_reverse_700(struct poly3 out, const struct poly3 in) {
				494	poly2_reverse_700(&out->a, &in->a);
				495	poly2_reverse_700(&out->s, &in->s);
				496	}
				497
				498	// poly3_word_mul sets (\|out_s\|, \|out_a\|) to (\|s1\|, \|a1\|) × (\|s2\|, \|a2\|).
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	499	static void poly3_word_mul(crypto_word_t out_s, crypto_word_t out_a,
				500	const crypto_word_t s1, const crypto_word_t a1,
				501	const crypto_word_t s2, const crypto_word_t a2) {
				502	*out_a = a1 & a2;
				503	out_s = (s1 ^ s2) & out_a;
				504	}
				505
				506	// poly3_word_add sets (\|out_s\|, \|out_a\|) to (\|s1\|, \|a1\|) + (\|s2\|, \|a2\|).
				507	static void poly3_word_add(crypto_word_t out_s, crypto_word_t out_a,
				508	const crypto_word_t s1, const crypto_word_t a1,
				509	const crypto_word_t s2, const crypto_word_t a2) {
				510	const crypto_word_t t = s1 ^ a2;
				511	*out_s = t & (s2 ^ a1);
				512	*out_a = (a1 ^ a2) \| (t ^ s2);
				513	}
				514
				515	// poly3_word_sub sets (\|out_s\|, \|out_a\|) to (\|s1\|, \|a1\|) - (\|s2\|, \|a2\|).
				516	static void poly3_word_sub(crypto_word_t out_s, crypto_word_t out_a,
				517	const crypto_word_t s1, const crypto_word_t a1,
				518	const crypto_word_t s2, const crypto_word_t a2) {
				519	const crypto_word_t t = a1 ^ a2;
				520	*out_s = (s1 ^ a2) & (t ^ s2);
				521	*out_a = t \| (s1 ^ s2);
				522	}
				523
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	524	// poly3_mul_const sets \|p\| to \|p\|×m, where m = (ms, ma).
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	525	static void poly3_mul_const(struct poly3 *p, crypto_word_t ms,
				526	crypto_word_t ma) {
				527	ms = lsb_to_all(ms);
				528	ma = lsb_to_all(ma);
				529
				530	for (size_t i = 0; i < WORDS_PER_POLY; i++) {
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	531	poly3_word_mul(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], ms, ma);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	532	}
				533	}
				534
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	535	// poly3_fmadd sets \|out\| to \|out\| - \|in\|×m, where m is (ms, ma).
				536	static void poly3_fmsub(struct poly3 *RESTRICT out,
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	537	const struct poly3 *RESTRICT in, crypto_word_t ms,
				538	crypto_word_t ma) {
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	539	crypto_word_t product_s, product_a;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	540	for (size_t i = 0; i < WORDS_PER_POLY; i++) {
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	541	poly3_word_mul(&product_s, &product_a, in->s.v[i], in->a.v[i], ms, ma);
				542	poly3_word_sub(&out->s.v[i], &out->a.v[i], out->s.v[i], out->a.v[i],
				543	product_s, product_a);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	544	}
				545	}
				546
				547	// final_bit_to_all replicates the bit in the final position of the last word to
				548	// all the bits in the word.
				549	static crypto_word_t final_bit_to_all(crypto_word_t v) {
				550	return lsb_to_all(v >> (BITS_IN_LAST_WORD - 1));
				551	}
				552
				553	// poly3_top_bits_are_clear returns one iff the extra bits in the final words of
				554	// \|p\| are zero.
				555	OPENSSL_UNUSED static int poly3_top_bits_are_clear(const struct poly3 *p) {
				556	return poly2_top_bits_are_clear(&p->s) && poly2_top_bits_are_clear(&p->a);
				557	}
				558
				559	// poly3_mod_phiN reduces \|p\| by Φ(N).
				560	static void poly3_mod_phiN(struct poly3 *p) {
				561	// In order to reduce by Φ(N) we subtract by the value of the greatest
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	562	// coefficient.
				563	const crypto_word_t factor_s = final_bit_to_all(p->s.v[WORDS_PER_POLY - 1]);
				564	const crypto_word_t factor_a = final_bit_to_all(p->a.v[WORDS_PER_POLY - 1]);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	565
				566	for (size_t i = 0; i < WORDS_PER_POLY; i++) {
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	567	poly3_word_sub(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], factor_s,
				568	factor_a);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	569	}
				570
				571	poly2_clear_top_bits(&p->s);
				572	poly2_clear_top_bits(&p->a);
				573	}
				574
				575	static void poly3_cswap(struct poly3 a, struct poly3 b, crypto_word_t swap) {
				576	poly2_cswap(&a->s, &b->s, swap);
				577	poly2_cswap(&a->a, &b->a, swap);
				578	}
				579
				580	static void poly3_lshift1(struct poly3 *p) {
				581	poly2_lshift1(&p->s);
				582	poly2_lshift1(&p->a);
				583	}
				584
				585	static void poly3_rshift1(struct poly3 *p) {
				586	poly2_rshift1(&p->s);
				587	poly2_rshift1(&p->a);
				588	}
				589
				590	// poly3_span represents a pointer into a poly3.
				591	struct poly3_span {
				592	crypto_word_t *s;
				593	crypto_word_t *a;
				594	};
				595
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	596	// poly3_span_add adds \|n\| words of values from \|a\| and \|b\| and writes the
				597	// result to \|out\|.
				598	static void poly3_span_add(const struct poly3_span *out,
				599	const struct poly3_span *a,
				600	const struct poly3_span *b, size_t n) {
				601	for (size_t i = 0; i < n; i++) {
				602	poly3_word_add(&out->s[i], &out->a[i], a->s[i], a->a[i], b->s[i], b->a[i]);
				603	}
				604	}
				605
				606	// poly3_span_sub subtracts \|n\| words of \|b\| from \|n\| words of \|a\|.
				607	static void poly3_span_sub(const struct poly3_span *a,
				608	const struct poly3_span *b, size_t n) {
				609	for (size_t i = 0; i < n; i++) {
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	610	poly3_word_sub(&a->s[i], &a->a[i], a->s[i], a->a[i], b->s[i], b->a[i]);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	611	}
				612	}
				613
				614	// poly3_mul_aux is a recursive function that multiplies \|n\| words from \|a\| and
				615	// \|b\| and writes 2×\|n\| words to \|out\|. Each call uses 2*ceil(n/2) elements of
				616	// \|scratch\| and the function recurses, except if \|n\| == 1, when \|scratch\| isn't
				617	// used and the recursion stops. For \|n\| in {11, 22}, the transitive total
				618	// amount of \|scratch\| needed happens to be 2n+2.
				619	static void poly3_mul_aux(const struct poly3_span *out,
				620	const struct poly3_span *scratch,
				621	const struct poly3_span *a,
				622	const struct poly3_span *b, size_t n) {
				623	if (n == 1) {
				624	crypto_word_t r_s_low = 0, r_s_high = 0, r_a_low = 0, r_a_high = 0;
				625	crypto_word_t b_s = b->s[0], b_a = b->a[0];
				626	const crypto_word_t a_s = a->s[0], a_a = a->a[0];
				627
				628	for (size_t i = 0; i < BITS_PER_WORD; i++) {
				629	// Multiply (s, a) by the next value from (b_s, b_a).
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	630	crypto_word_t m_s, m_a;
				631	poly3_word_mul(&m_s, &m_a, a_s, a_a, lsb_to_all(b_s), lsb_to_all(b_a));
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	632	b_s >>= 1;
				633	b_a >>= 1;
				634
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	635	if (i == 0) {
				636	// Special case otherwise the code tries to shift by BITS_PER_WORD
				637	// below, which is undefined.
				638	r_s_low = m_s;
				639	r_a_low = m_a;
				640	continue;
				641	}
				642
				643	// Shift the multiplication result to the correct position.
				644	const crypto_word_t m_s_low = m_s << i;
				645	const crypto_word_t m_s_high = m_s >> (BITS_PER_WORD - i);
				646	const crypto_word_t m_a_low = m_a << i;
				647	const crypto_word_t m_a_high = m_a >> (BITS_PER_WORD - i);
				648
				649	// Add into the result.
				650	poly3_word_add(&r_s_low, &r_a_low, r_s_low, r_a_low, m_s_low, m_a_low);
				651	poly3_word_add(&r_s_high, &r_a_high, r_s_high, r_a_high, m_s_high,
				652	m_a_high);
				653	}
				654
				655	out->s[0] = r_s_low;
				656	out->s[1] = r_s_high;
				657	out->a[0] = r_a_low;
				658	out->a[1] = r_a_high;
				659	return;
				660	}
				661
				662	// Karatsuba multiplication.
				663	// https://en.wikipedia.org/wiki/Karatsuba_algorithm
				664
				665	// When \|n\| is odd, the two "halves" will have different lengths. The first
				666	// is always the smaller.
				667	const size_t low_len = n / 2;
				668	const size_t high_len = n - low_len;
				669	const struct poly3_span a_high = {&a->s[low_len], &a->a[low_len]};
				670	const struct poly3_span b_high = {&b->s[low_len], &b->a[low_len]};
				671
				672	// Store a_1 + a_0 in the first half of \|out\| and b_1 + b_0 in the second
				673	// half.
				674	const struct poly3_span a_cross_sum = *out;
				675	const struct poly3_span b_cross_sum = {&out->s[high_len], &out->a[high_len]};
				676	poly3_span_add(&a_cross_sum, a, &a_high, low_len);
				677	poly3_span_add(&b_cross_sum, b, &b_high, low_len);
				678	if (high_len != low_len) {
				679	a_cross_sum.s[low_len] = a_high.s[low_len];
				680	a_cross_sum.a[low_len] = a_high.a[low_len];
				681	b_cross_sum.s[low_len] = b_high.s[low_len];
				682	b_cross_sum.a[low_len] = b_high.a[low_len];
				683	}
				684
				685	const struct poly3_span child_scratch = {&scratch->s[2 * high_len],
				686	&scratch->a[2 * high_len]};
				687	const struct poly3_span out_mid = {&out->s[low_len], &out->a[low_len]};
				688	const struct poly3_span out_high = {&out->s[2 * low_len],
				689	&out->a[2 * low_len]};
				690
				691	// Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
				692	poly3_mul_aux(scratch, &child_scratch, &a_cross_sum, &b_cross_sum, high_len);
				693	// Calculate a_1 × b_1.
				694	poly3_mul_aux(&out_high, &child_scratch, &a_high, &b_high, high_len);
				695	// Calculate a_0 × b_0.
				696	poly3_mul_aux(out, &child_scratch, a, b, low_len);
				697
				698	// Subtract those last two products from the first.
				699	poly3_span_sub(scratch, out, low_len * 2);
				700	poly3_span_sub(scratch, &out_high, high_len * 2);
				701
				702	// Add the middle product into the output.
				703	poly3_span_add(&out_mid, &out_mid, scratch, high_len * 2);
				704	}
				705
				706	// HRSS_poly3_mul sets \|*out\| to \|x\|×\|y\| mod Φ(N).
				707	void HRSS_poly3_mul(struct poly3 out, const struct poly3 x,
				708	const struct poly3 *y) {
				709	crypto_word_t prod_s[WORDS_PER_POLY * 2];
				710	crypto_word_t prod_a[WORDS_PER_POLY * 2];
				711	crypto_word_t scratch_s[WORDS_PER_POLY * 2 + 2];
				712	crypto_word_t scratch_a[WORDS_PER_POLY * 2 + 2];
				713	const struct poly3_span prod_span = {prod_s, prod_a};
				714	const struct poly3_span scratch_span = {scratch_s, scratch_a};
				715	const struct poly3_span x_span = {(crypto_word_t *)x->s.v,
				716	(crypto_word_t *)x->a.v};
				717	const struct poly3_span y_span = {(crypto_word_t *)y->s.v,
				718	(crypto_word_t *)y->a.v};
				719
				720	poly3_mul_aux(&prod_span, &scratch_span, &x_span, &y_span, WORDS_PER_POLY);
				721
				722	// \|prod\| needs to be reduced mod (𝑥^n - 1), which just involves adding the
				723	// upper-half to the lower-half. However, N is 701, which isn't a multiple of
				724	// BITS_PER_WORD, so the upper-half vectors all have to be shifted before
				725	// being added to the lower-half.
				726	for (size_t i = 0; i < WORDS_PER_POLY; i++) {
				727	crypto_word_t v_s = prod_s[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
				728	v_s \|= prod_s[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
				729	crypto_word_t v_a = prod_a[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
				730	v_a \|= prod_a[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
				731
				732	poly3_word_add(&out->s.v[i], &out->a.v[i], prod_s[i], prod_a[i], v_s, v_a);
				733	}
				734
				735	poly3_mod_phiN(out);
				736	}
				737
				738	#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
				739
				740	// poly3_vec_cswap swaps (\|a_s\|, \|a_a\|) and (\|b_s\|, \|b_a\|) if \|swap\| is
				741	// \|0xff..ff\|. Otherwise, \|swap\| must be zero.
				742	static inline void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
				743	vec_t b_a[6], const vec_t swap) {
				744	for (int i = 0; i < 6; i++) {
				745	const vec_t sum_s = swap & (a_s[i] ^ b_s[i]);
				746	a_s[i] ^= sum_s;
				747	b_s[i] ^= sum_s;
				748
				749	const vec_t sum_a = swap & (a_a[i] ^ b_a[i]);
				750	a_a[i] ^= sum_a;
				751	b_a[i] ^= sum_a;
				752	}
				753	}
				754
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	755	// poly3_vec_fmsub subtracts (\|ms\|, \|ma\|) × (\|b_s\|, \|b_a\|) from (\|a_s\|, \|a_a\|).
				756	static inline void poly3_vec_fmsub(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	757	vec_t b_a[6], const vec_t ms,
				758	const vec_t ma) {
				759	for (int i = 0; i < 6; i++) {
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	760	// See the bitslice formula, above.
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	761	const vec_t s = b_s[i];
				762	const vec_t a = b_a[i];
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	763	const vec_t product_a = a & ma;
				764	const vec_t product_s = (s ^ ms) & product_a;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	765
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	766	const vec_t out_s = a_s[i];
				767	const vec_t out_a = a_a[i];
				768	const vec_t t = out_a ^ product_a;
				769	a_s[i] = (out_s ^ product_a) & (t ^ product_s);
				770	a_a[i] = t \| (out_s ^ product_s);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	771	}
				772	}
				773
				774	// poly3_invert_vec sets \|*out\| to \|in\|^-1, i.e. such that \|out\|×\|in\| == 1 mod
				775	// Φ(N).
				776	static void poly3_invert_vec(struct poly3 out, const struct poly3 in) {
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	777	// This algorithm is taken from section 7.1 of [SAFEGCD].
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	778	const vec_t kZero = {0};
				779	const vec_t kOne = {1};
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	780	static const uint8_t kBottomSixtyOne[sizeof(vec_t)] = {
				781	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f};
				782
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	783	vec_t v_s[6], v_a[6], r_s[6], r_a[6], f_s[6], f_a[6], g_s[6], g_a[6];
				784	// v = 0
				785	memset(&v_s, 0, sizeof(v_s));
				786	memset(&v_a, 0, sizeof(v_a));
				787	// r = 1
				788	memset(&r_s, 0, sizeof(r_s));
				789	memset(&r_a, 0, sizeof(r_a));
				790	r_a[0] = kOne;
				791	// f = all ones.
				792	memset(f_s, 0, sizeof(f_s));
				793	memset(f_a, 0xff, 5 * sizeof(vec_t));
				794	memcpy(&f_a[5], kBottomSixtyOne, sizeof(kBottomSixtyOne));
				795	// g is the reversal of \|in\|.
				796	struct poly3 in_reversed;
				797	poly3_reverse_700(&in_reversed, in);
				798	g_s[5] = kZero;
				799	memcpy(&g_s, &in_reversed.s.v, WORDS_PER_POLY * sizeof(crypto_word_t));
				800	g_a[5] = kZero;
				801	memcpy(&g_a, &in_reversed.a.v, WORDS_PER_POLY * sizeof(crypto_word_t));
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	802
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	803	int delta = 1;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	804
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	805	for (size_t i = 0; i < (2*(N-1)) - 1; i++) {
				806	poly3_vec_lshift1(v_s, v_a);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	807
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	808	const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1;
				809	const crypto_word_t delta_is_non_negative = delta_sign_bit - 1;
				810	const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta);
				811	const vec_t g_has_constant_term = vec_broadcast_bit(g_a[0]);
				812	const vec_t mask_w =
				813	{delta_is_non_negative & delta_is_non_zero};
				814	const vec_t mask = vec_broadcast_bit(mask_w) & g_has_constant_term;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	815
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	816	const vec_t c_a = vec_broadcast_bit(f_a[0] & g_a[0]);
				817	const vec_t c_s = vec_broadcast_bit((f_s[0] ^ g_s[0]) & c_a);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	818
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	819	delta = constant_time_select_int(lsb_to_all(mask[0]), -delta, delta);
				820	delta++;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	821
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	822	poly3_vec_cswap(f_s, f_a, g_s, g_a, mask);
				823	poly3_vec_fmsub(g_s, g_a, f_s, f_a, c_s, c_a);
				824	poly3_vec_rshift1(g_s, g_a);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	825
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	826	poly3_vec_cswap(v_s, v_a, r_s, r_a, mask);
				827	poly3_vec_fmsub(r_s, r_a, v_s, v_a, c_s, c_a);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	828	}
				829
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	830	assert(delta == 0);
				831	memcpy(out->s.v, v_s, WORDS_PER_POLY * sizeof(crypto_word_t));
				832	memcpy(out->a.v, v_a, WORDS_PER_POLY * sizeof(crypto_word_t));
				833	poly3_mul_const(out, vec_get_word(f_s[0], 0), vec_get_word(f_a[0], 0));
				834	poly3_reverse_700(out, out);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	835	}
				836
				837	#endif // HRSS_HAVE_VECTOR_UNIT
				838
				839	// HRSS_poly3_invert sets \|*out\| to \|in\|^-1, i.e. such that \|out\|×\|in\| == 1 mod
				840	// Φ(N).
				841	void HRSS_poly3_invert(struct poly3 out, const struct poly3 in) {
				842	// The vector version of this function seems slightly slower on AArch64, but
				843	// is useful on ARMv7 and x86-64.
				844	#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
				845	if (vec_capable()) {
				846	poly3_invert_vec(out, in);
				847	return;
				848	}
				849	#endif
				850
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	851	// This algorithm is taken from section 7.1 of [SAFEGCD].
				852	struct poly3 v, r, f, g;
				853	// v = 0
				854	poly3_zero(&v);
				855	// r = 1
				856	poly3_zero(&r);
				857	r.a.v[0] = 1;
				858	// f = all ones.
				859	OPENSSL_memset(&f.s, 0, sizeof(struct poly2));
				860	OPENSSL_memset(&f.a, 0xff, sizeof(struct poly2));
				861	f.a.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD;
				862	// g is the reversal of \|in\|.
				863	poly3_reverse_700(&g, in);
				864	int delta = 1;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	865
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	866	for (size_t i = 0; i < (2*(N-1)) - 1; i++) {
				867	poly3_lshift1(&v);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	868
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	869	const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1;
				870	const crypto_word_t delta_is_non_negative = delta_sign_bit - 1;
				871	const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta);
				872	const crypto_word_t g_has_constant_term = lsb_to_all(g.a.v[0]);
				873	const crypto_word_t mask =
				874	g_has_constant_term & delta_is_non_negative & delta_is_non_zero;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	875
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	876	crypto_word_t c_s, c_a;
				877	poly3_word_mul(&c_s, &c_a, f.s.v[0], f.a.v[0], g.s.v[0], g.a.v[0]);
				878	c_s = lsb_to_all(c_s);
				879	c_a = lsb_to_all(c_a);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	880
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	881	delta = constant_time_select_int(mask, -delta, delta);
				882	delta++;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	883
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	884	poly3_cswap(&f, &g, mask);
				885	poly3_fmsub(&g, &f, c_s, c_a);
				886	poly3_rshift1(&g);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	887
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	888	poly3_cswap(&v, &r, mask);
				889	poly3_fmsub(&r, &v, c_s, c_a);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	890	}
				891
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	892	assert(delta == 0);
				893	poly3_mul_const(&v, f.s.v[0], f.a.v[0]);
				894	poly3_reverse_700(out, &v);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	895	}
				896
				897	// Polynomials in Q.
				898
				899	// Coefficients are reduced mod Q. (Q is clearly not prime, therefore the
				900	// coefficients do not form a field.)
				901	#define Q 8192
				902
				903	// VECS_PER_POLY is the number of 128-bit vectors needed to represent a
				904	// polynomial.
				905	#define COEFFICIENTS_PER_VEC (sizeof(vec_t) / sizeof(uint16_t))
				906	#define VECS_PER_POLY ((N + COEFFICIENTS_PER_VEC - 1) / COEFFICIENTS_PER_VEC)
				907
				908	// poly represents a polynomial with coefficients mod Q. Note that, while Q is a
				909	// power of two, this does not operate in GF(Q). That would be a binary field
				910	// but this is simply mod Q. Thus the coefficients are not a field.
				911	//
				912	// Coefficients are ordered little-endian, thus the coefficient of x^0 is the
				913	// first element of the array.
				914	struct poly {
				915	#if defined(HRSS_HAVE_VECTOR_UNIT)
				916	union {
				917	// N + 3 = 704, which is a multiple of 64 and thus aligns things, esp for
				918	// the vector code.
				919	uint16_t v[N + 3];
				920	vec_t vectors[VECS_PER_POLY];
				921	};
				922	#else
Adam Langley	1ea083d	2018-12-12 10:46:13 -0800	[diff] [blame]	923	// Even if !HRSS_HAVE_VECTOR_UNIT, external assembly may be called that
				924	// requires alignment.
				925	alignas(16) uint16_t v[N + 3];
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	926	#endif
				927	};
				928
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	929	// poly_normalize zeros out the excess elements of \|x\| which are included only
				930	// for alignment.
				931	static void poly_normalize(struct poly *x) {
				932	OPENSSL_memset(&x->v[N], 0, 3 * sizeof(uint16_t));
				933	}
				934
				935	// poly_assert_normalized asserts that the excess elements of \|x\| are zeroed out
				936	// for the cases that case. (E.g. \|poly_mul_vec\|.)
				937	static void poly_assert_normalized(const struct poly *x) {
				938	assert(x->v[N] == 0);
				939	assert(x->v[N + 1] == 0);
				940	assert(x->v[N + 2] == 0);
				941	}
				942
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	943	OPENSSL_UNUSED static void poly_print(const struct poly *p) {
				944	printf("[");
				945	for (unsigned i = 0; i < N; i++) {
				946	if (i) {
				947	printf(" ");
				948	}
				949	printf("%d", p->v[i]);
				950	}
				951	printf("]\n");
				952	}
				953
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	954	// POLY_MUL_SCRATCH contains space for the working variables needed by
				955	// \|poly_mul\|. The contents afterwards may be discarded, but the object may also
				956	// be reused with future \|poly_mul\| calls to save heap allocations.
				957	//
				958	// This object must have 32-byte alignment.
				959	struct POLY_MUL_SCRATCH {
				960	union {
				961	// This is used by \|poly_mul_novec\|.
				962	struct {
				963	uint16_t prod[2 * N];
				964	uint16_t scratch[1318];
				965	} novec;
				966
				967	#if defined(HRSS_HAVE_VECTOR_UNIT)
				968	// This is used by \|poly_mul_vec\|.
				969	struct {
				970	vec_t prod[VECS_PER_POLY * 2];
				971	vec_t scratch[172];
				972	} vec;
				973	#endif
Adam Langley	04b3a96	2023-02-08 21:08:49 +0000	[diff] [blame]	974
				975	#if defined(POLY_RQ_MUL_ASM)
				976	// This is the space used by \|poly_Rq_mul\|.
				977	uint8_t rq[POLY_MUL_RQ_SCRATCH_SPACE];
				978	#endif
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	979	} u;
				980	};
				981
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	982	#if defined(HRSS_HAVE_VECTOR_UNIT)
				983
				984	// poly_mul_vec_aux is a recursive function that multiplies \|n\| words from \|a\|
				985	// and \|b\| and writes 2×\|n\| words to \|out\|. Each call uses 2*ceil(n/2) elements
				986	// of \|scratch\| and the function recurses, except if \|n\| < 3, when \|scratch\|
				987	// isn't used and the recursion stops. If \|n\| == \|VECS_PER_POLY\| then \|scratch\|
				988	// needs 172 elements.
				989	static void poly_mul_vec_aux(vec_t restrict out, vec_t restrict scratch,
				990	const vec_t restrict a, const vec_t restrict b,
				991	const size_t n) {
				992	// In [HRSS], the technique they used for polynomial multiplication is
				993	// described: they start with Toom-4 at the top level and then two layers of
				994	// Karatsuba. Karatsuba is a specific instance of the general Toom–Cook
				995	// decomposition, which splits an input n-ways and produces 2n-1
				996	// multiplications of those parts. So, starting with 704 coefficients (rounded
				997	// up from 701 to have more factors of two), Toom-4 gives seven
				998	// multiplications of degree-174 polynomials. Each round of Karatsuba (which
				999	// is Toom-2) increases the number of multiplications by a factor of three
				1000	// while halving the size of the values being multiplied. So two rounds gives
				1001	// 63 multiplications of degree-44 polynomials. Then they (I think) form
				1002	// vectors by gathering all 63 coefficients of each power together, for each
				1003	// input, and doing more rounds of Karatsuba on the vectors until they bottom-
				1004	// out somewhere with schoolbook multiplication.
				1005	//
				1006	// I tried something like that for NEON. NEON vectors are 128 bits so hold
				1007	// eight coefficients. I wrote a function that did Karatsuba on eight
				1008	// multiplications at the same time, using such vectors, and a Go script that
				1009	// decomposed from degree-704, with Karatsuba in non-transposed form, until it
				1010	// reached multiplications of degree-44. It batched up those 81
				1011	// multiplications into lots of eight with a single one left over (which was
				1012	// handled directly).
				1013	//
				1014	// It worked, but it was significantly slower than the dumb algorithm used
				1015	// below. Potentially that was because I misunderstood how [HRSS] did it, or
				1016	// because Clang is bad at generating good code from NEON intrinsics on ARMv7.
				1017	// (Which is true: the code generated by Clang for the below is pretty crap.)
				1018	//
				1019	// This algorithm is much simpler. It just does Karatsuba decomposition all
				1020	// the way down and never transposes. When it gets down to degree-16 or
				1021	// degree-24 values, they are multiplied using schoolbook multiplication and
				1022	// vector intrinsics. The vector operations form each of the eight phase-
				1023	// shifts of one of the inputs, point-wise multiply, and then add into the
				1024	// result at the correct place. This means that 33% (degree-16) or 25%
				1025	// (degree-24) of the multiplies and adds are wasted, but it does ok.
				1026	if (n == 2) {
				1027	vec_t result[4];
				1028	vec_t vec_a[3];
				1029	static const vec_t kZero = {0};
				1030	vec_a[0] = a[0];
				1031	vec_a[1] = a[1];
				1032	vec_a[2] = kZero;
				1033
				1034	result[0] = vec_mul(vec_a[0], vec_get_word(b[0], 0));
				1035	result[1] = vec_mul(vec_a[1], vec_get_word(b[0], 0));
				1036
				1037	result[1] = vec_fma(result[1], vec_a[0], vec_get_word(b[1], 0));
				1038	result[2] = vec_mul(vec_a[1], vec_get_word(b[1], 0));
				1039	result[3] = kZero;
				1040
				1041	vec3_rshift_word(vec_a);
				1042
				1043	#define BLOCK(x, y) \
				1044	do { \
				1045	result[x + 0] = \
				1046	vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
				1047	result[x + 1] = \
				1048	vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
				1049	result[x + 2] = \
				1050	vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
				1051	} while (0)
				1052
				1053	BLOCK(0, 1);
				1054	BLOCK(1, 9);
				1055
				1056	vec3_rshift_word(vec_a);
				1057
				1058	BLOCK(0, 2);
				1059	BLOCK(1, 10);
				1060
				1061	vec3_rshift_word(vec_a);
				1062
				1063	BLOCK(0, 3);
				1064	BLOCK(1, 11);
				1065
				1066	vec3_rshift_word(vec_a);
				1067
				1068	BLOCK(0, 4);
				1069	BLOCK(1, 12);
				1070
				1071	vec3_rshift_word(vec_a);
				1072
				1073	BLOCK(0, 5);
				1074	BLOCK(1, 13);
				1075
				1076	vec3_rshift_word(vec_a);
				1077
				1078	BLOCK(0, 6);
				1079	BLOCK(1, 14);
				1080
				1081	vec3_rshift_word(vec_a);
				1082
				1083	BLOCK(0, 7);
				1084	BLOCK(1, 15);
				1085
				1086	#undef BLOCK
				1087
				1088	memcpy(out, result, sizeof(result));
				1089	return;
				1090	}
				1091
				1092	if (n == 3) {
				1093	vec_t result[6];
				1094	vec_t vec_a[4];
				1095	static const vec_t kZero = {0};
				1096	vec_a[0] = a[0];
				1097	vec_a[1] = a[1];
				1098	vec_a[2] = a[2];
				1099	vec_a[3] = kZero;
				1100
				1101	result[0] = vec_mul(a[0], vec_get_word(b[0], 0));
				1102	result[1] = vec_mul(a[1], vec_get_word(b[0], 0));
				1103	result[2] = vec_mul(a[2], vec_get_word(b[0], 0));
				1104
				1105	#define BLOCK_PRE(x, y) \
				1106	do { \
				1107	result[x + 0] = \
				1108	vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
				1109	result[x + 1] = \
				1110	vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
				1111	result[x + 2] = vec_mul(vec_a[2], vec_get_word(b[y / 8], y % 8)); \
				1112	} while (0)
				1113
				1114	BLOCK_PRE(1, 8);
				1115	BLOCK_PRE(2, 16);
				1116
				1117	result[5] = kZero;
				1118
				1119	vec4_rshift_word(vec_a);
				1120
				1121	#define BLOCK(x, y) \
				1122	do { \
				1123	result[x + 0] = \
				1124	vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
				1125	result[x + 1] = \
				1126	vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
				1127	result[x + 2] = \
				1128	vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
				1129	result[x + 3] = \
				1130	vec_fma(result[x + 3], vec_a[3], vec_get_word(b[y / 8], y % 8)); \
				1131	} while (0)
				1132
				1133	BLOCK(0, 1);
				1134	BLOCK(1, 9);
				1135	BLOCK(2, 17);
				1136
				1137	vec4_rshift_word(vec_a);
				1138
				1139	BLOCK(0, 2);
				1140	BLOCK(1, 10);
				1141	BLOCK(2, 18);
				1142
				1143	vec4_rshift_word(vec_a);
				1144
				1145	BLOCK(0, 3);
				1146	BLOCK(1, 11);
				1147	BLOCK(2, 19);
				1148
				1149	vec4_rshift_word(vec_a);
				1150
				1151	BLOCK(0, 4);
				1152	BLOCK(1, 12);
				1153	BLOCK(2, 20);
				1154
				1155	vec4_rshift_word(vec_a);
				1156
				1157	BLOCK(0, 5);
				1158	BLOCK(1, 13);
				1159	BLOCK(2, 21);
				1160
				1161	vec4_rshift_word(vec_a);
				1162
				1163	BLOCK(0, 6);
				1164	BLOCK(1, 14);
				1165	BLOCK(2, 22);
				1166
				1167	vec4_rshift_word(vec_a);
				1168
				1169	BLOCK(0, 7);
				1170	BLOCK(1, 15);
				1171	BLOCK(2, 23);
				1172
				1173	#undef BLOCK
				1174	#undef BLOCK_PRE
				1175
				1176	memcpy(out, result, sizeof(result));
				1177
				1178	return;
				1179	}
				1180
				1181	// Karatsuba multiplication.
				1182	// https://en.wikipedia.org/wiki/Karatsuba_algorithm
				1183
				1184	// When \|n\| is odd, the two "halves" will have different lengths. The first is
				1185	// always the smaller.
				1186	const size_t low_len = n / 2;
				1187	const size_t high_len = n - low_len;
				1188	const vec_t *a_high = &a[low_len];
				1189	const vec_t *b_high = &b[low_len];
				1190
				1191	// Store a_1 + a_0 in the first half of \|out\| and b_1 + b_0 in the second
				1192	// half.
				1193	for (size_t i = 0; i < low_len; i++) {
				1194	out[i] = vec_add(a_high[i], a[i]);
				1195	out[high_len + i] = vec_add(b_high[i], b[i]);
				1196	}
				1197	if (high_len != low_len) {
				1198	out[low_len] = a_high[low_len];
				1199	out[high_len + low_len] = b_high[low_len];
				1200	}
				1201
				1202	vec_t const child_scratch = &scratch[2 high_len];
				1203	// Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
				1204	poly_mul_vec_aux(scratch, child_scratch, out, &out[high_len], high_len);
				1205	// Calculate a_1 × b_1.
				1206	poly_mul_vec_aux(&out[low_len * 2], child_scratch, a_high, b_high, high_len);
				1207	// Calculate a_0 × b_0.
				1208	poly_mul_vec_aux(out, child_scratch, a, b, low_len);
				1209
				1210	// Subtract those last two products from the first.
				1211	for (size_t i = 0; i < low_len * 2; i++) {
				1212	scratch[i] = vec_sub(scratch[i], vec_add(out[i], out[low_len * 2 + i]));
				1213	}
				1214	if (low_len != high_len) {
				1215	scratch[low_len * 2] = vec_sub(scratch[low_len * 2], out[low_len * 4]);
				1216	scratch[low_len * 2 + 1] =
				1217	vec_sub(scratch[low_len * 2 + 1], out[low_len * 4 + 1]);
				1218	}
				1219
				1220	// Add the middle product into the output.
				1221	for (size_t i = 0; i < high_len * 2; i++) {
				1222	out[low_len + i] = vec_add(out[low_len + i], scratch[i]);
				1223	}
				1224	}
				1225
				1226	// poly_mul_vec sets \|*out\| to \|x\|×\|y\| mod (𝑥^n - 1).
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1227	static void poly_mul_vec(struct POLY_MUL_SCRATCH scratch, struct poly out,
				1228	const struct poly x, const struct poly y) {
David Benjamin	b7d6320	2022-07-26 13:25:02 -0700	[diff] [blame]	1229	static_assert(sizeof(out->v) == sizeof(vec_t) * VECS_PER_POLY,
				1230	"struct poly is the wrong size");
				1231	static_assert(alignof(struct poly) == alignof(vec_t),
				1232	"struct poly has incorrect alignment");
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1233	poly_assert_normalized(x);
				1234	poly_assert_normalized(y);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1235
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1236	vec_t *const prod = scratch->u.vec.prod;
				1237	vec_t *const aux_scratch = scratch->u.vec.scratch;
				1238	poly_mul_vec_aux(prod, aux_scratch, x->vectors, y->vectors, VECS_PER_POLY);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1239
				1240	// \|prod\| needs to be reduced mod (𝑥^n - 1), which just involves adding the
				1241	// upper-half to the lower-half. However, N is 701, which isn't a multiple of
				1242	// the vector size, so the upper-half vectors all have to be shifted before
				1243	// being added to the lower-half.
				1244	vec_t out_vecs = (vec_t )out->v;
				1245
				1246	for (size_t i = 0; i < VECS_PER_POLY; i++) {
				1247	const vec_t prev = prod[VECS_PER_POLY - 1 + i];
				1248	const vec_t this = prod[VECS_PER_POLY + i];
				1249	out_vecs[i] = vec_add(prod[i], vec_merge_3_5(prev, this));
				1250	}
				1251
				1252	OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
				1253	}
				1254
				1255	#endif // HRSS_HAVE_VECTOR_UNIT
				1256
				1257	// poly_mul_novec_aux writes the product of \|a\| and \|b\| to \|out\|, using
				1258	// \|scratch\| as scratch space. It'll use Karatsuba if the inputs are large
				1259	// enough to warrant it. Each call uses 2*ceil(n/2) elements of \|scratch\| and
				1260	// the function recurses, except if \|n\| < 64, when \|scratch\| isn't used and the
				1261	// recursion stops. If \|n\| == \|N\| then \|scratch\| needs 1318 elements.
				1262	static void poly_mul_novec_aux(uint16_t out, uint16_t scratch,
				1263	const uint16_t a, const uint16_t b, size_t n) {
				1264	static const size_t kSchoolbookLimit = 64;
				1265	if (n < kSchoolbookLimit) {
				1266	OPENSSL_memset(out, 0, sizeof(uint16_t) * n * 2);
				1267	for (size_t i = 0; i < n; i++) {
				1268	for (size_t j = 0; j < n; j++) {
				1269	out[i + j] += (unsigned) a[i] * b[j];
				1270	}
				1271	}
				1272
				1273	return;
				1274	}
				1275
				1276	// Karatsuba multiplication.
				1277	// https://en.wikipedia.org/wiki/Karatsuba_algorithm
				1278
				1279	// When \|n\| is odd, the two "halves" will have different lengths. The
				1280	// first is always the smaller.
				1281	const size_t low_len = n / 2;
				1282	const size_t high_len = n - low_len;
				1283	const uint16_t *const a_high = &a[low_len];
				1284	const uint16_t *const b_high = &b[low_len];
				1285
				1286	for (size_t i = 0; i < low_len; i++) {
				1287	out[i] = a_high[i] + a[i];
				1288	out[high_len + i] = b_high[i] + b[i];
				1289	}
				1290	if (high_len != low_len) {
				1291	out[low_len] = a_high[low_len];
				1292	out[high_len + low_len] = b_high[low_len];
				1293	}
				1294
				1295	uint16_t const child_scratch = &scratch[2 high_len];
				1296	poly_mul_novec_aux(scratch, child_scratch, out, &out[high_len], high_len);
				1297	poly_mul_novec_aux(&out[low_len * 2], child_scratch, a_high, b_high,
				1298	high_len);
				1299	poly_mul_novec_aux(out, child_scratch, a, b, low_len);
				1300
				1301	for (size_t i = 0; i < low_len * 2; i++) {
				1302	scratch[i] -= out[i] + out[low_len * 2 + i];
				1303	}
				1304	if (low_len != high_len) {
				1305	scratch[low_len * 2] -= out[low_len * 4];
				1306	assert(out[low_len * 4 + 1] == 0);
				1307	}
				1308
				1309	for (size_t i = 0; i < high_len * 2; i++) {
				1310	out[low_len + i] += scratch[i];
				1311	}
				1312	}
				1313
				1314	// poly_mul_novec sets \|*out\| to \|x\|×\|y\| mod (𝑥^n - 1).
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1315	static void poly_mul_novec(struct POLY_MUL_SCRATCH scratch, struct poly out,
				1316	const struct poly x, const struct poly y) {
				1317	uint16_t *const prod = scratch->u.novec.prod;
				1318	uint16_t *const aux_scratch = scratch->u.novec.scratch;
				1319	poly_mul_novec_aux(prod, aux_scratch, x->v, y->v, N);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1320
				1321	for (size_t i = 0; i < N; i++) {
				1322	out->v[i] = prod[i] + prod[i + N];
				1323	}
				1324	OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
				1325	}
				1326
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1327	static void poly_mul(struct POLY_MUL_SCRATCH scratch, struct poly r,
				1328	const struct poly a, const struct poly b) {
Adam Langley	04b3a96	2023-02-08 21:08:49 +0000	[diff] [blame]	1329	#if defined(POLY_RQ_MUL_ASM)
				1330	if (CRYPTO_is_AVX2_capable()) {
				1331	poly_Rq_mul(r->v, a->v, b->v, scratch->u.rq);
				1332	poly_normalize(r);
				1333	} else
				1334	#endif
				1335
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1336	#if defined(HRSS_HAVE_VECTOR_UNIT)
				1337	if (vec_capable()) {
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1338	poly_mul_vec(scratch, r, a, b);
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1339	} else
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1340	#endif
				1341
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1342	// Fallback, non-vector case.
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1343	{
				1344	poly_mul_novec(scratch, r, a, b);
				1345	}
				1346
				1347	poly_assert_normalized(r);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1348	}
				1349
				1350	// poly_mul_x_minus_1 sets \|p\| to \|p\|×(𝑥 - 1) mod (𝑥^n - 1).
				1351	static void poly_mul_x_minus_1(struct poly *p) {
				1352	// Multiplying by (𝑥 - 1) means negating each coefficient and adding in
				1353	// the value of the previous one.
				1354	const uint16_t orig_final_coefficient = p->v[N - 1];
				1355
				1356	for (size_t i = N - 1; i > 0; i--) {
				1357	p->v[i] = p->v[i - 1] - p->v[i];
				1358	}
				1359	p->v[0] = orig_final_coefficient - p->v[0];
				1360	}
				1361
				1362	// poly_mod_phiN sets \|p\| to \|p\| mod Φ(N).
				1363	static void poly_mod_phiN(struct poly *p) {
				1364	const uint16_t coeff700 = p->v[N - 1];
				1365
				1366	for (unsigned i = 0; i < N; i++) {
				1367	p->v[i] -= coeff700;
				1368	}
				1369	}
				1370
				1371	// poly_clamp reduces each coefficient mod Q.
				1372	static void poly_clamp(struct poly *p) {
				1373	for (unsigned i = 0; i < N; i++) {
				1374	p->v[i] &= Q - 1;
				1375	}
				1376	}
				1377
				1378
				1379	// Conversion functions
				1380	// --------------------
				1381
				1382	// poly2_from_poly sets \|*out\| to \|in\| mod 2.
				1383	static void poly2_from_poly(struct poly2 out, const struct poly in) {
				1384	crypto_word_t *words = out->v;
				1385	unsigned shift = 0;
				1386	crypto_word_t word = 0;
				1387
				1388	for (unsigned i = 0; i < N; i++) {
				1389	word >>= 1;
				1390	word \|= (crypto_word_t)(in->v[i] & 1) << (BITS_PER_WORD - 1);
				1391	shift++;
				1392
				1393	if (shift == BITS_PER_WORD) {
				1394	*words = word;
				1395	words++;
				1396	word = 0;
				1397	shift = 0;
				1398	}
				1399	}
				1400
				1401	word >>= BITS_PER_WORD - shift;
				1402	*words = word;
				1403	}
				1404
Adam Langley	72f0155	2019-01-21 12:53:43 -0800	[diff] [blame]	1405	// mod3 treats \|a\| as a signed number and returns \|a\| mod 3.
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1406	static uint16_t mod3(int16_t a) {
				1407	const int16_t q = ((int32_t)a * 21845) >> 16;
				1408	int16_t ret = a - 3 * q;
				1409	// At this point, \|ret\| is in {0, 1, 2, 3} and that needs to be mapped to {0,
				1410	// 1, 2, 0}.
				1411	return ret & ((ret & (ret >> 1)) - 1);
				1412	}
				1413
				1414	// poly3_from_poly sets \|*out\| to \|in\|.
				1415	static void poly3_from_poly(struct poly3 out, const struct poly in) {
				1416	crypto_word_t *words_s = out->s.v;
				1417	crypto_word_t *words_a = out->a.v;
				1418	crypto_word_t s = 0;
				1419	crypto_word_t a = 0;
				1420	unsigned shift = 0;
				1421
				1422	for (unsigned i = 0; i < N; i++) {
				1423	// This duplicates the 13th bit upwards to the top of the uint16,
				1424	// essentially treating it as a sign bit and converting into a signed int16.
				1425	// The signed value is reduced mod 3, yielding {0, 1, 2}.
				1426	const uint16_t v = mod3((int16_t)(in->v[i] << 3) >> 3);
				1427	s >>= 1;
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	1428	const crypto_word_t s_bit = (crypto_word_t)(v & 2) << (BITS_PER_WORD - 2);
				1429	s \|= s_bit;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1430	a >>= 1;
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	1431	a \|= s_bit \| (crypto_word_t)(v & 1) << (BITS_PER_WORD - 1);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1432	shift++;
				1433
				1434	if (shift == BITS_PER_WORD) {
				1435	*words_s = s;
				1436	words_s++;
				1437	*words_a = a;
				1438	words_a++;
				1439	s = a = 0;
				1440	shift = 0;
				1441	}
				1442	}
				1443
				1444	s >>= BITS_PER_WORD - shift;
				1445	a >>= BITS_PER_WORD - shift;
				1446	*words_s = s;
				1447	*words_a = a;
				1448	}
				1449
				1450	// poly3_from_poly_checked sets \|*out\| to \|in\|, which has coefficients in {0, 1,
				1451	// Q-1}. It returns a mask indicating whether all coefficients were found to be
				1452	// in that set.
				1453	static crypto_word_t poly3_from_poly_checked(struct poly3 *out,
				1454	const struct poly *in) {
				1455	crypto_word_t *words_s = out->s.v;
				1456	crypto_word_t *words_a = out->a.v;
				1457	crypto_word_t s = 0;
				1458	crypto_word_t a = 0;
				1459	unsigned shift = 0;
				1460	crypto_word_t ok = CONSTTIME_TRUE_W;
				1461
				1462	for (unsigned i = 0; i < N; i++) {
				1463	const uint16_t v = in->v[i];
				1464	// Maps {0, 1, Q-1} to {0, 1, 2}.
				1465	uint16_t mod3 = v & 3;
				1466	mod3 ^= mod3 >> 1;
				1467	const uint16_t expected = (uint16_t)((~((mod3 >> 1) - 1)) \| mod3) % Q;
				1468	ok &= constant_time_eq_w(v, expected);
				1469
				1470	s >>= 1;
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	1471	const crypto_word_t s_bit = (crypto_word_t)(mod3 & 2)
				1472	<< (BITS_PER_WORD - 2);
				1473	s \|= s_bit;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1474	a >>= 1;
Adam Langley	eadef47	2019-01-16 17:20:24 -0800	[diff] [blame]	1475	a \|= s_bit \| (crypto_word_t)(mod3 & 1) << (BITS_PER_WORD - 1);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1476	shift++;
				1477
				1478	if (shift == BITS_PER_WORD) {
				1479	*words_s = s;
				1480	words_s++;
				1481	*words_a = a;
				1482	words_a++;
				1483	s = a = 0;
				1484	shift = 0;
				1485	}
				1486	}
				1487
				1488	s >>= BITS_PER_WORD - shift;
				1489	a >>= BITS_PER_WORD - shift;
				1490	*words_s = s;
				1491	*words_a = a;
				1492
				1493	return ok;
				1494	}
				1495
				1496	static void poly_from_poly2(struct poly out, const struct poly2 in) {
				1497	const crypto_word_t *words = in->v;
				1498	unsigned shift = 0;
				1499	crypto_word_t word = *words;
				1500
				1501	for (unsigned i = 0; i < N; i++) {
				1502	out->v[i] = word & 1;
				1503	word >>= 1;
				1504	shift++;
				1505
				1506	if (shift == BITS_PER_WORD) {
				1507	words++;
				1508	word = *words;
				1509	shift = 0;
				1510	}
				1511	}
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1512
				1513	poly_normalize(out);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1514	}
				1515
				1516	static void poly_from_poly3(struct poly out, const struct poly3 in) {
				1517	const crypto_word_t *words_s = in->s.v;
				1518	const crypto_word_t *words_a = in->a.v;
				1519	crypto_word_t word_s = ~(*words_s);
				1520	crypto_word_t word_a = *words_a;
				1521	unsigned shift = 0;
				1522
				1523	for (unsigned i = 0; i < N; i++) {
				1524	out->v[i] = (uint16_t)(word_s & 1) - 1;
				1525	out->v[i] \|= word_a & 1;
				1526	word_s >>= 1;
				1527	word_a >>= 1;
				1528	shift++;
				1529
				1530	if (shift == BITS_PER_WORD) {
				1531	words_s++;
				1532	words_a++;
				1533	word_s = ~(*words_s);
				1534	word_a = *words_a;
				1535	shift = 0;
				1536	}
				1537	}
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1538
				1539	poly_normalize(out);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1540	}
				1541
				1542	// Polynomial inversion
				1543	// --------------------
				1544
				1545	// poly_invert_mod2 sets \|out\| to \|in^-1\| (i.e. such that \|out\|×\|in\| = 1 mod
				1546	// Φ(N)), all mod 2. This isn't useful in itself, but is part of doing inversion
				1547	// mod Q.
				1548	static void poly_invert_mod2(struct poly out, const struct poly in) {
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	1549	// This algorithm is taken from section 7.1 of [SAFEGCD].
				1550	struct poly2 v, r, f, g;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1551
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	1552	// v = 0
				1553	poly2_zero(&v);
				1554	// r = 1
				1555	poly2_zero(&r);
				1556	r.v[0] = 1;
				1557	// f = all ones.
				1558	OPENSSL_memset(&f, 0xff, sizeof(struct poly2));
				1559	f.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD;
				1560	// g is the reversal of \|in\|.
				1561	poly2_from_poly(&g, in);
				1562	poly2_mod_phiN(&g);
				1563	poly2_reverse_700(&g, &g);
				1564	int delta = 1;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1565
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	1566	for (size_t i = 0; i < (2*(N-1)) - 1; i++) {
				1567	poly2_lshift1(&v);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1568
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	1569	const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1;
				1570	const crypto_word_t delta_is_non_negative = delta_sign_bit - 1;
				1571	const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta);
				1572	const crypto_word_t g_has_constant_term = lsb_to_all(g.v[0]);
				1573	const crypto_word_t mask =
				1574	g_has_constant_term & delta_is_non_negative & delta_is_non_zero;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1575
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	1576	const crypto_word_t c = lsb_to_all(f.v[0] & g.v[0]);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1577
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	1578	delta = constant_time_select_int(mask, -delta, delta);
				1579	delta++;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1580
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	1581	poly2_cswap(&f, &g, mask);
				1582	poly2_fmadd(&g, &f, c);
				1583	poly2_rshift1(&g);
				1584
				1585	poly2_cswap(&v, &r, mask);
				1586	poly2_fmadd(&r, &v, c);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1587	}
				1588
Adam Langley	cbae965	2019-12-06 14:58:52 -0800	[diff] [blame]	1589	assert(delta == 0);
				1590	assert(f.v[0] & 1);
				1591	poly2_reverse_700(&v, &v);
				1592	poly_from_poly2(out, &v);
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1593	poly_assert_normalized(out);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1594	}
				1595
				1596	// poly_invert sets \|out\| to \|in^-1\| (i.e. such that \|out\|×\|in\| = 1 mod Φ(N)).
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1597	static void poly_invert(struct POLY_MUL_SCRATCH scratch, struct poly out,
				1598	const struct poly *in) {
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1599	// Inversion mod Q, which is done based on the result of inverting mod
				1600	// 2. See [NTRUTN14] paper, bottom of page two.
				1601	struct poly a, *b, tmp;
				1602
				1603	// a = -in.
				1604	for (unsigned i = 0; i < N; i++) {
				1605	a.v[i] = -in->v[i];
				1606	}
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1607	poly_normalize(&a);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1608
				1609	// b = in^-1 mod 2.
				1610	b = out;
				1611	poly_invert_mod2(b, in);
				1612
				1613	// We are working mod Q=2**13 and we need to iterate ceil(log_2(13))
				1614	// times, which is four.
				1615	for (unsigned i = 0; i < 4; i++) {
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1616	poly_mul(scratch, &tmp, &a, b);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1617	tmp.v[0] += 2;
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1618	poly_mul(scratch, b, b, &tmp);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1619	}
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1620
				1621	poly_assert_normalized(out);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1622	}
				1623
				1624	// Marshal and unmarshal functions for various basic types.
				1625	// --------------------------------------------------------
				1626
				1627	#define POLY_BYTES 1138
				1628
Adam Langley	9700b44	2018-12-17 11:17:26 -0800	[diff] [blame]	1629	// poly_marshal serialises all but the final coefficient of \|in\| to \|out\|.
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1630	static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in) {
				1631	const uint16_t *p = in->v;
				1632
				1633	for (size_t i = 0; i < N / 8; i++) {
				1634	out[0] = p[0];
				1635	out[1] = (0x1f & (p[0] >> 8)) \| ((p[1] & 0x07) << 5);
				1636	out[2] = p[1] >> 3;
				1637	out[3] = (3 & (p[1] >> 11)) \| ((p[2] & 0x3f) << 2);
				1638	out[4] = (0x7f & (p[2] >> 6)) \| ((p[3] & 0x01) << 7);
				1639	out[5] = p[3] >> 1;
				1640	out[6] = (0xf & (p[3] >> 9)) \| ((p[4] & 0x0f) << 4);
				1641	out[7] = p[4] >> 4;
				1642	out[8] = (1 & (p[4] >> 12)) \| ((p[5] & 0x7f) << 1);
				1643	out[9] = (0x3f & (p[5] >> 7)) \| ((p[6] & 0x03) << 6);
				1644	out[10] = p[6] >> 2;
				1645	out[11] = (7 & (p[6] >> 10)) \| ((p[7] & 0x1f) << 3);
				1646	out[12] = p[7] >> 5;
				1647
				1648	p += 8;
				1649	out += 13;
				1650	}
				1651
				1652	// There are four remaining values.
				1653	out[0] = p[0];
				1654	out[1] = (0x1f & (p[0] >> 8)) \| ((p[1] & 0x07) << 5);
				1655	out[2] = p[1] >> 3;
				1656	out[3] = (3 & (p[1] >> 11)) \| ((p[2] & 0x3f) << 2);
				1657	out[4] = (0x7f & (p[2] >> 6)) \| ((p[3] & 0x01) << 7);
				1658	out[5] = p[3] >> 1;
				1659	out[6] = 0xf & (p[3] >> 9);
				1660	}
				1661
Adam Langley	9700b44	2018-12-17 11:17:26 -0800	[diff] [blame]	1662	// poly_unmarshal parses the output of \|poly_marshal\| and sets \|out\| such that
				1663	// all but the final coefficients match, and the final coefficient is calculated
				1664	// such that evaluating \|out\| at one results in zero. It returns one on success
				1665	// or zero if \|in\| is an invalid encoding.
Adam Langley	f8068ce	2018-12-17 11:16:23 -0800	[diff] [blame]	1666	static int poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES]) {
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1667	uint16_t *p = out->v;
				1668
				1669	for (size_t i = 0; i < N / 8; i++) {
				1670	p[0] = (uint16_t)(in[0]) \| (uint16_t)(in[1] & 0x1f) << 8;
				1671	p[1] = (uint16_t)(in[1] >> 5) \| (uint16_t)(in[2]) << 3 \|
				1672	(uint16_t)(in[3] & 3) << 11;
				1673	p[2] = (uint16_t)(in[3] >> 2) \| (uint16_t)(in[4] & 0x7f) << 6;
				1674	p[3] = (uint16_t)(in[4] >> 7) \| (uint16_t)(in[5]) << 1 \|
				1675	(uint16_t)(in[6] & 0xf) << 9;
				1676	p[4] = (uint16_t)(in[6] >> 4) \| (uint16_t)(in[7]) << 4 \|
				1677	(uint16_t)(in[8] & 1) << 12;
				1678	p[5] = (uint16_t)(in[8] >> 1) \| (uint16_t)(in[9] & 0x3f) << 7;
				1679	p[6] = (uint16_t)(in[9] >> 6) \| (uint16_t)(in[10]) << 2 \|
				1680	(uint16_t)(in[11] & 7) << 10;
				1681	p[7] = (uint16_t)(in[11] >> 3) \| (uint16_t)(in[12]) << 5;
				1682
				1683	p += 8;
				1684	in += 13;
				1685	}
				1686
				1687	// There are four coefficients remaining.
				1688	p[0] = (uint16_t)(in[0]) \| (uint16_t)(in[1] & 0x1f) << 8;
				1689	p[1] = (uint16_t)(in[1] >> 5) \| (uint16_t)(in[2]) << 3 \|
				1690	(uint16_t)(in[3] & 3) << 11;
				1691	p[2] = (uint16_t)(in[3] >> 2) \| (uint16_t)(in[4] & 0x7f) << 6;
				1692	p[3] = (uint16_t)(in[4] >> 7) \| (uint16_t)(in[5]) << 1 \|
				1693	(uint16_t)(in[6] & 0xf) << 9;
				1694
				1695	for (unsigned i = 0; i < N - 1; i++) {
				1696	out->v[i] = (int16_t)(out->v[i] << 3) >> 3;
				1697	}
				1698
Adam Langley	f8068ce	2018-12-17 11:16:23 -0800	[diff] [blame]	1699	// There are four unused bits in the last byte. We require them to be zero.
				1700	if ((in[6] & 0xf0) != 0) {
				1701	return 0;
				1702	}
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1703
				1704	// Set the final coefficient as specifed in [HRSSNIST] 1.9.2 step 6.
				1705	uint32_t sum = 0;
				1706	for (size_t i = 0; i < N - 1; i++) {
				1707	sum += out->v[i];
				1708	}
				1709
				1710	out->v[N - 1] = (uint16_t)(0u - sum);
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1711	poly_normalize(out);
Adam Langley	f8068ce	2018-12-17 11:16:23 -0800	[diff] [blame]	1712
				1713	return 1;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1714	}
				1715
				1716	// mod3_from_modQ maps {0, 1, Q-1, 65535} -> {0, 1, 2, 2}. Note that \|v\| may
				1717	// have an invalid value when processing attacker-controlled inputs.
				1718	static uint16_t mod3_from_modQ(uint16_t v) {
				1719	v &= 3;
				1720	return v ^ (v >> 1);
				1721	}
				1722
				1723	// poly_marshal_mod3 marshals \|in\| to \|out\| where the coefficients of \|in\| are
				1724	// all in {0, 1, Q-1, 65535} and \|in\| is mod Φ(N). (Note that coefficients may
				1725	// have invalid values when processing attacker-controlled inputs.)
				1726	static void poly_marshal_mod3(uint8_t out[HRSS_POLY3_BYTES],
				1727	const struct poly *in) {
				1728	const uint16_t *coeffs = in->v;
				1729
				1730	// Only 700 coefficients are marshaled because in[700] must be zero.
				1731	assert(coeffs[N-1] == 0);
				1732
				1733	for (size_t i = 0; i < HRSS_POLY3_BYTES; i++) {
				1734	const uint16_t coeffs0 = mod3_from_modQ(coeffs[0]);
				1735	const uint16_t coeffs1 = mod3_from_modQ(coeffs[1]);
				1736	const uint16_t coeffs2 = mod3_from_modQ(coeffs[2]);
				1737	const uint16_t coeffs3 = mod3_from_modQ(coeffs[3]);
				1738	const uint16_t coeffs4 = mod3_from_modQ(coeffs[4]);
				1739	out[i] = coeffs0 + coeffs1 * 3 + coeffs2 * 9 + coeffs3 * 27 + coeffs4 * 81;
				1740	coeffs += 5;
				1741	}
				1742	}
				1743
				1744	// HRSS-specific functions
				1745	// -----------------------
				1746
Adam Langley	72f0155	2019-01-21 12:53:43 -0800	[diff] [blame]	1747	// poly_short_sample samples a vector of values in {0xffff (i.e. -1), 0, 1}.
				1748	// This is the same action as the algorithm in [HRSSNIST] section 1.8.1, but
				1749	// with HRSS-SXY the sampling algorithm is now a private detail of the
				1750	// implementation (previously it had to match between two parties). This
				1751	// function uses that freedom to implement a flatter distribution of values.
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1752	static void poly_short_sample(struct poly *out,
				1753	const uint8_t in[HRSS_SAMPLE_BYTES]) {
David Benjamin	b7d6320	2022-07-26 13:25:02 -0700	[diff] [blame]	1754	static_assert(HRSS_SAMPLE_BYTES == N - 1, "HRSS_SAMPLE_BYTES incorrect");
Adam Langley	72f0155	2019-01-21 12:53:43 -0800	[diff] [blame]	1755	for (size_t i = 0; i < N - 1; i++) {
				1756	uint16_t v = mod3(in[i]);
				1757	// Map {0, 1, 2} -> {0, 1, 0xffff}
				1758	v \|= ((v >> 1) ^ 1) - 1;
				1759	out->v[i] = v;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1760	}
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1761	out->v[N - 1] = 0;
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1762	poly_normalize(out);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1763	}
				1764
				1765	// poly_short_sample_plus performs the T+ sample as defined in [HRSSNIST],
				1766	// section 1.8.2.
				1767	static void poly_short_sample_plus(struct poly *out,
				1768	const uint8_t in[HRSS_SAMPLE_BYTES]) {
				1769	poly_short_sample(out, in);
				1770
				1771	// sum (and the product in the for loop) will overflow. But that's fine
				1772	// because \|sum\| is bound by +/- (N-2), and N < 2^15 so it works out.
				1773	uint16_t sum = 0;
				1774	for (unsigned i = 0; i < N - 2; i++) {
				1775	sum += (unsigned) out->v[i] * out->v[i + 1];
				1776	}
				1777
				1778	// If the sum is negative, flip the sign of even-positioned coefficients. (See
				1779	// page 8 of [HRSS].)
				1780	sum = ((int16_t) sum) >> 15;
				1781	const uint16_t scale = sum \| (~sum & 1);
				1782	for (unsigned i = 0; i < N; i += 2) {
				1783	out->v[i] = (unsigned) out->v[i] * scale;
				1784	}
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1785	poly_assert_normalized(out);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1786	}
				1787
				1788	// poly_lift computes the function discussed in [HRSS], appendix B.
				1789	static void poly_lift(struct poly out, const struct poly a) {
				1790	// We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the
				1791	// Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime).
				1792
				1793	// 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up:
				1794	//
				1795	// R.<x> = PolynomialRing(GF(3)…)
				1796	// inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n))
				1797	// list(inv)[:15]
				1798	// [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2]
				1799	//
				1800	// This three-element pattern of coefficients repeats for the whole
				1801	// polynomial.
				1802	//
				1803	// Next define the overbar operator such that z̅ = z[0] +
				1804	// reverse(z[1:]). (Index zero of a polynomial here is the coefficient
				1805	// of the constant term. So index one is the coefficient of 𝑥 and so
				1806	// on.)
				1807	//
				1808	// A less odd way to define this is to see that z̅ negates the indexes,
				1809	// so z̅[0] = z[-0], z̅[1] = z[-1] and so on.
				1810	//
				1811	// The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = <v,
				1812	// z̅>, vz[1] = <v, 𝑥z̅>, …. (Where <a, b> is the inner product: the sum
				1813	// of the point-wise products.) Although we calculated the inverse mod
				1814	// Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end.
				1815	// (That's because (𝑥^N - 1) is a multiple of Φ(N).)
				1816	//
				1817	// When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation
				1818	// of the list of coefficients.
				1819	//
				1820	// Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like:
				1821	//
				1822	// def reverse(xs):
				1823	// suffix = list(xs[1:])
				1824	// suffix.reverse()
				1825	// return [xs[0]] + suffix
				1826	//
				1827	// def rotate(xs):
				1828	// return [xs[-1]] + xs[:-1]
				1829	//
				1830	// zoverbar = reverse(list(inv) + [0])
				1831	// xzoverbar = rotate(reverse(list(inv) + [0]))
				1832	// x2zoverbar = rotate(rotate(reverse(list(inv) + [0])))
				1833	//
				1834	// zoverbar[:15]
				1835	// [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1]
				1836	// xzoverbar[:15]
				1837	// [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
				1838	// x2zoverbar[:15]
				1839	// [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
				1840	//
				1841	// (For a formula for z̅, see lemma two of appendix B.)
				1842	//
				1843	// After the first three elements have been taken care of, all then have
				1844	// a repeating three-element cycle. The next value (𝑥^3z̅) involves
				1845	// three rotations of the first pattern, thus the three-element cycle
				1846	// lines up. However, the discontinuity in the first three elements
				1847	// obviously moves to a different position. Consider the difference
				1848	// between 𝑥^3z̅ and z̅:
				1849	//
				1850	// [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15]
				1851	// [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
				1852	//
				1853	// This pattern of differences is the same for all elements, although it
				1854	// obviously moves right with the rotations.
				1855	//
				1856	// From this, we reach algorithm eight of appendix B.
				1857
				1858	// Handle the first three elements of the inner products.
				1859	out->v[0] = a->v[0] + a->v[2];
				1860	out->v[1] = a->v[1];
				1861	out->v[2] = -a->v[0] + a->v[2];
				1862
				1863	// s0, s1, s2 are added into out->v[0], out->v[1], and out->v[2],
				1864	// respectively. We do not compute s1 because it's just -(s0 + s1).
				1865	uint16_t s0 = 0, s2 = 0;
				1866	for (size_t i = 3; i < 699; i += 3) {
				1867	s0 += -a->v[i] + a->v[i + 2];
				1868	// s1 += a->v[i] - a->v[i + 1];
				1869	s2 += a->v[i + 1] - a->v[i + 2];
				1870	}
				1871
				1872	// Handle the fact that the three-element pattern doesn't fill the
				1873	// polynomial exactly (since 701 isn't a multiple of three).
				1874	s0 -= a->v[699];
				1875	// s1 += a->v[699] - a->v[700];
				1876	s2 += a->v[700];
				1877
				1878	// Note that s0 + s1 + s2 = 0.
				1879	out->v[0] += s0;
				1880	out->v[1] -= (s0 + s2); // = s1
				1881	out->v[2] += s2;
				1882
				1883	// Calculate the remaining inner products by taking advantage of the
				1884	// fact that the pattern repeats every three cycles and the pattern of
				1885	// differences moves with the rotation.
				1886	for (size_t i = 3; i < N; i++) {
				1887	out->v[i] = (out->v[i - 3] - (a->v[i - 2] + a->v[i - 1] + a->v[i]));
				1888	}
				1889
				1890	// Reduce mod Φ(N) by subtracting a multiple of out[700] from every
				1891	// element and convert to mod Q. (See above about adding twice as
				1892	// subtraction.)
				1893	const crypto_word_t v = out->v[700];
				1894	for (unsigned i = 0; i < N; i++) {
				1895	const uint16_t vi_mod3 = mod3(out->v[i] - v);
				1896	// Map {0, 1, 2} to {0, 1, 0xffff}.
				1897	out->v[i] = (~((vi_mod3 >> 1) - 1)) \| vi_mod3;
				1898	}
				1899
				1900	poly_mul_x_minus_1(out);
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1901	poly_normalize(out);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1902	}
				1903
				1904	struct public_key {
				1905	struct poly ph;
				1906	};
				1907
				1908	struct private_key {
				1909	struct poly3 f, f_inverse;
				1910	struct poly ph_inverse;
				1911	uint8_t hmac_key[32];
				1912	};
				1913
				1914	// public_key_from_external converts an external public key pointer into an
				1915	// internal one. Externally the alignment is only specified to be eight bytes
				1916	// but we need 16-byte alignment. We could annotate the external struct with
				1917	// that alignment but we can only assume that malloced pointers are 8-byte
				1918	// aligned in any case. (Even if the underlying malloc returns values with
				1919	// 16-byte alignment, \|OPENSSL_malloc\| will store an 8-byte size prefix and mess
				1920	// that up.)
				1921	static struct public_key *public_key_from_external(
				1922	struct HRSS_public_key *ext) {
David Benjamin	b7d6320	2022-07-26 13:25:02 -0700	[diff] [blame]	1923	static_assert(
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1924	sizeof(struct HRSS_public_key) >= sizeof(struct public_key) + 15,
				1925	"HRSS public key too small");
				1926
David Benjamin	ecc301c	2021-07-02 12:30:40 -0400	[diff] [blame]	1927	return align_pointer(ext->opaque, 16);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1928	}
				1929
				1930	// private_key_from_external does the same thing as \|public_key_from_external\|,
				1931	// but for private keys. See the comment on that function about alignment
				1932	// issues.
				1933	static struct private_key *private_key_from_external(
				1934	struct HRSS_private_key *ext) {
David Benjamin	b7d6320	2022-07-26 13:25:02 -0700	[diff] [blame]	1935	static_assert(
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1936	sizeof(struct HRSS_private_key) >= sizeof(struct private_key) + 15,
				1937	"HRSS private key too small");
				1938
David Benjamin	ecc301c	2021-07-02 12:30:40 -0400	[diff] [blame]	1939	return align_pointer(ext->opaque, 16);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1940	}
				1941
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1942	// malloc_align32 returns a pointer to \|size\| bytes of 32-byte-aligned heap and
				1943	// sets \|*out_ptr\| to a value that can be passed to \|OPENSSL_free\| to release
				1944	// it. It returns NULL if out of memory.
				1945	static void malloc_align32(void *out_ptr, size_t size) {
				1946	void *ptr = OPENSSL_malloc(size + 31);
				1947	if (!ptr) {
				1948	*out_ptr = NULL;
				1949	return NULL;
				1950	}
				1951
				1952	*out_ptr = ptr;
				1953	return align_pointer(ptr, 32);
				1954	}
				1955
				1956	int HRSS_generate_key(
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1957	struct HRSS_public_key out_pub, struct HRSS_private_key out_priv,
				1958	const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32]) {
				1959	struct public_key *pub = public_key_from_external(out_pub);
				1960	struct private_key *priv = private_key_from_external(out_priv);
				1961
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1962	struct vars {
				1963	struct POLY_MUL_SCRATCH scratch;
				1964	struct poly f;
				1965	struct poly pg_phi1;
				1966	struct poly pfg_phi1;
				1967	struct poly pfg_phi1_inverse;
				1968	};
				1969
				1970	void *malloc_ptr;
				1971	struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars));
				1972	if (!vars) {
				1973	// If the caller ignores the return value the output will still be safe.
				1974	// The private key output is randomised in case it's later passed to
				1975	// \|HRSS_encap\|.
				1976	memset(out_pub, 0, sizeof(struct HRSS_public_key));
				1977	RAND_bytes((uint8_t*) out_priv, sizeof(struct HRSS_private_key));
				1978	return 0;
				1979	}
				1980
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	1981	#if !defined(NDEBUG)
				1982	OPENSSL_memset(vars, 0xff, sizeof(struct vars));
				1983	#endif
				1984
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1985	OPENSSL_memcpy(priv->hmac_key, in + 2 * HRSS_SAMPLE_BYTES,
				1986	sizeof(priv->hmac_key));
				1987
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1988	poly_short_sample_plus(&vars->f, in);
				1989	poly3_from_poly(&priv->f, &vars->f);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1990	HRSS_poly3_invert(&priv->f_inverse, &priv->f);
				1991
				1992	// pg_phi1 is p (i.e. 3) × g × Φ(1) (i.e. 𝑥-1).
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1993	poly_short_sample_plus(&vars->pg_phi1, in + HRSS_SAMPLE_BYTES);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1994	for (unsigned i = 0; i < N; i++) {
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1995	vars->pg_phi1.v[i] *= 3;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1996	}
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1997	poly_mul_x_minus_1(&vars->pg_phi1);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	1998
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	1999	poly_mul(&vars->scratch, &vars->pfg_phi1, &vars->f, &vars->pg_phi1);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2000
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2001	poly_invert(&vars->scratch, &vars->pfg_phi1_inverse, &vars->pfg_phi1);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2002
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2003	poly_mul(&vars->scratch, &pub->ph, &vars->pfg_phi1_inverse, &vars->pg_phi1);
				2004	poly_mul(&vars->scratch, &pub->ph, &pub->ph, &vars->pg_phi1);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2005	poly_clamp(&pub->ph);
				2006
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2007	poly_mul(&vars->scratch, &priv->ph_inverse, &vars->pfg_phi1_inverse,
				2008	&vars->f);
				2009	poly_mul(&vars->scratch, &priv->ph_inverse, &priv->ph_inverse, &vars->f);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2010	poly_clamp(&priv->ph_inverse);
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2011
				2012	OPENSSL_free(malloc_ptr);
				2013	return 1;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2014	}
				2015
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2016	static const char kSharedKey[] = "shared key";
				2017
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2018	int HRSS_encap(uint8_t out_ciphertext[POLY_BYTES], uint8_t out_shared_key[32],
				2019	const struct HRSS_public_key *in_pub,
				2020	const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES]) {
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2021	const struct public_key *pub =
				2022	public_key_from_external((struct HRSS_public_key *)in_pub);
Adam Langley	9700b44	2018-12-17 11:17:26 -0800	[diff] [blame]	2023
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2024	struct vars {
				2025	struct POLY_MUL_SCRATCH scratch;
				2026	struct poly m, r, m_lifted;
				2027	struct poly prh_plus_m;
				2028	SHA256_CTX hash_ctx;
				2029	uint8_t m_bytes[HRSS_POLY3_BYTES];
				2030	uint8_t r_bytes[HRSS_POLY3_BYTES];
				2031	};
				2032
				2033	void *malloc_ptr;
				2034	struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars));
				2035	if (!vars) {
				2036	// If the caller ignores the return value the output will still be safe.
				2037	// The private key output is randomised in case it's used to encrypt and
				2038	// transmit something.
				2039	memset(out_ciphertext, 0, POLY_BYTES);
				2040	RAND_bytes(out_shared_key, 32);
				2041	return 0;
Adam Langley	9700b44	2018-12-17 11:17:26 -0800	[diff] [blame]	2042	}
				2043
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	2044	#if !defined(NDEBUG)
				2045	OPENSSL_memset(vars, 0xff, sizeof(struct vars));
				2046	#endif
				2047
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2048	poly_short_sample(&vars->m, in);
				2049	poly_short_sample(&vars->r, in + HRSS_SAMPLE_BYTES);
				2050	poly_lift(&vars->m_lifted, &vars->m);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2051
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2052	poly_mul(&vars->scratch, &vars->prh_plus_m, &vars->r, &pub->ph);
				2053	for (unsigned i = 0; i < N; i++) {
				2054	vars->prh_plus_m.v[i] += vars->m_lifted.v[i];
				2055	}
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2056
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2057	poly_marshal(out_ciphertext, &vars->prh_plus_m);
				2058
				2059	poly_marshal_mod3(vars->m_bytes, &vars->m);
				2060	poly_marshal_mod3(vars->r_bytes, &vars->r);
				2061
				2062	SHA256_Init(&vars->hash_ctx);
				2063	SHA256_Update(&vars->hash_ctx, kSharedKey, sizeof(kSharedKey));
				2064	SHA256_Update(&vars->hash_ctx, vars->m_bytes, sizeof(vars->m_bytes));
				2065	SHA256_Update(&vars->hash_ctx, vars->r_bytes, sizeof(vars->r_bytes));
				2066	SHA256_Update(&vars->hash_ctx, out_ciphertext, POLY_BYTES);
				2067	SHA256_Final(out_shared_key, &vars->hash_ctx);
				2068
				2069	OPENSSL_free(malloc_ptr);
				2070	return 1;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2071	}
				2072
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2073	int HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES],
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2074	const struct HRSS_private_key *in_priv,
				2075	const uint8_t *ciphertext, size_t ciphertext_len) {
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2076	const struct private_key *priv =
				2077	private_key_from_external((struct HRSS_private_key *)in_priv);
				2078
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2079	struct vars {
				2080	struct POLY_MUL_SCRATCH scratch;
				2081	uint8_t masked_key[SHA256_CBLOCK];
				2082	SHA256_CTX hash_ctx;
				2083	struct poly c;
				2084	struct poly f, cf;
				2085	struct poly3 cf3, m3;
				2086	struct poly m, m_lifted;
				2087	struct poly r;
				2088	struct poly3 r3;
				2089	uint8_t expected_ciphertext[HRSS_CIPHERTEXT_BYTES];
				2090	uint8_t m_bytes[HRSS_POLY3_BYTES];
				2091	uint8_t r_bytes[HRSS_POLY3_BYTES];
				2092	uint8_t shared_key[32];
				2093	};
				2094
				2095	void *malloc_ptr;
				2096	struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars));
				2097	if (!vars) {
				2098	// If the caller ignores the return value the output will still be safe.
				2099	// The private key output is randomised in case it's used to encrypt and
				2100	// transmit something.
				2101	RAND_bytes(out_shared_key, HRSS_KEY_BYTES);
				2102	return 0;
				2103	}
				2104
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	2105	#if !defined(NDEBUG)
				2106	OPENSSL_memset(vars, 0xff, sizeof(struct vars));
				2107	#endif
				2108
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2109	// This is HMAC, expanded inline rather than using the \|HMAC\| function so that
				2110	// we can avoid dealing with possible allocation failures and so keep this
				2111	// function infallible.
David Benjamin	b7d6320	2022-07-26 13:25:02 -0700	[diff] [blame]	2112	static_assert(sizeof(priv->hmac_key) <= sizeof(vars->masked_key),
				2113	"HRSS HMAC key larger than SHA-256 block size");
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2114	for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2115	vars->masked_key[i] = priv->hmac_key[i] ^ 0x36;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2116	}
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2117	OPENSSL_memset(vars->masked_key + sizeof(priv->hmac_key), 0x36,
				2118	sizeof(vars->masked_key) - sizeof(priv->hmac_key));
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2119
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2120	SHA256_Init(&vars->hash_ctx);
				2121	SHA256_Update(&vars->hash_ctx, vars->masked_key, sizeof(vars->masked_key));
				2122	SHA256_Update(&vars->hash_ctx, ciphertext, ciphertext_len);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2123	uint8_t inner_digest[SHA256_DIGEST_LENGTH];
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2124	SHA256_Final(inner_digest, &vars->hash_ctx);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2125
				2126	for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2127	vars->masked_key[i] ^= (0x5c ^ 0x36);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2128	}
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2129	OPENSSL_memset(vars->masked_key + sizeof(priv->hmac_key), 0x5c,
				2130	sizeof(vars->masked_key) - sizeof(priv->hmac_key));
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2131
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2132	SHA256_Init(&vars->hash_ctx);
				2133	SHA256_Update(&vars->hash_ctx, vars->masked_key, sizeof(vars->masked_key));
				2134	SHA256_Update(&vars->hash_ctx, inner_digest, sizeof(inner_digest));
David Benjamin	b7d6320	2022-07-26 13:25:02 -0700	[diff] [blame]	2135	static_assert(HRSS_KEY_BYTES == SHA256_DIGEST_LENGTH,
				2136	"HRSS shared key length incorrect");
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2137	SHA256_Final(out_shared_key, &vars->hash_ctx);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2138
				2139	// If the ciphertext is publicly invalid then a random shared key is still
				2140	// returned to simply the logic of the caller, but this path is not constant
				2141	// time.
Adam Langley	f8068ce	2018-12-17 11:16:23 -0800	[diff] [blame]	2142	if (ciphertext_len != HRSS_CIPHERTEXT_BYTES \|\|
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2143	!poly_unmarshal(&vars->c, ciphertext)) {
				2144	goto out;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2145	}
				2146
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2147	poly_from_poly3(&vars->f, &priv->f);
				2148	poly_mul(&vars->scratch, &vars->cf, &vars->c, &vars->f);
				2149	poly3_from_poly(&vars->cf3, &vars->cf);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2150	// Note that cf3 is not reduced mod Φ(N). That reduction is deferred.
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2151	HRSS_poly3_mul(&vars->m3, &vars->cf3, &priv->f_inverse);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2152
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2153	poly_from_poly3(&vars->m, &vars->m3);
				2154	poly_lift(&vars->m_lifted, &vars->m);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2155
				2156	for (unsigned i = 0; i < N; i++) {
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2157	vars->r.v[i] = vars->c.v[i] - vars->m_lifted.v[i];
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2158	}
Adam Langley	f7e1a94	2022-04-01 11:46:32 -0700	[diff] [blame]	2159	poly_normalize(&vars->r);
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2160	poly_mul(&vars->scratch, &vars->r, &vars->r, &priv->ph_inverse);
				2161	poly_mod_phiN(&vars->r);
				2162	poly_clamp(&vars->r);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2163
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2164	crypto_word_t ok = poly3_from_poly_checked(&vars->r3, &vars->r);
Adam Langley	9700b44	2018-12-17 11:17:26 -0800	[diff] [blame]	2165
				2166	// [NTRUCOMP] section 5.1 includes ReEnc2 and a proof that it's valid. Rather
				2167	// than do an expensive \|poly_mul\|, it rebuilds \|c'\| from \|c - lift(m)\|
				2168	// (called \|b\|) with:
				2169	// t = (−b(1)/N) mod Q
				2170	// c' = b + tΦ(N) + lift(m) mod Q
				2171	//
				2172	// When polynomials are transmitted, the final coefficient is omitted and
				2173	// \|poly_unmarshal\| sets it such that f(1) == 0. Thus c(1) == 0. Also,
				2174	// \|poly_lift\| multiplies the result by (x-1) and therefore evaluating a
				2175	// lifted polynomial at 1 is also zero. Thus lift(m)(1) == 0 and so
				2176	// (c - lift(m))(1) == 0.
				2177	//
				2178	// Although we defer the reduction above, \|b\| is conceptually reduced mod
				2179	// Φ(N). In order to do that reduction one subtracts \|c[N-1]\| from every
				2180	// coefficient. Therefore b(1) = -c[N-1]×N. The value of \|t\|, above, then is
				2181	// just recovering \|c[N-1]\|, and adding tΦ(N) is simply undoing the reduction.
				2182	// Therefore b + tΦ(N) + lift(m) = c by construction and we don't need to
				2183	// recover \|c\| at all so long as we do the checks in
				2184	// \|poly3_from_poly_checked\|.
				2185	//
				2186	// The \|poly_marshal\| here then is just confirming that \|poly_unmarshal\| is
				2187	// strict and could be omitted.
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2188
David Benjamin	b7d6320	2022-07-26 13:25:02 -0700	[diff] [blame]	2189	static_assert(HRSS_CIPHERTEXT_BYTES == POLY_BYTES,
				2190	"ciphertext is the wrong size");
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2191	assert(ciphertext_len == sizeof(vars->expected_ciphertext));
				2192	poly_marshal(vars->expected_ciphertext, &vars->c);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2193
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2194	poly_marshal_mod3(vars->m_bytes, &vars->m);
				2195	poly_marshal_mod3(vars->r_bytes, &vars->r);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2196
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2197	ok &= constant_time_is_zero_w(
				2198	CRYPTO_memcmp(ciphertext, vars->expected_ciphertext,
				2199	sizeof(vars->expected_ciphertext)));
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2200
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2201	SHA256_Init(&vars->hash_ctx);
				2202	SHA256_Update(&vars->hash_ctx, kSharedKey, sizeof(kSharedKey));
				2203	SHA256_Update(&vars->hash_ctx, vars->m_bytes, sizeof(vars->m_bytes));
				2204	SHA256_Update(&vars->hash_ctx, vars->r_bytes, sizeof(vars->r_bytes));
				2205	SHA256_Update(&vars->hash_ctx, vars->expected_ciphertext,
				2206	sizeof(vars->expected_ciphertext));
				2207	SHA256_Final(vars->shared_key, &vars->hash_ctx);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2208
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2209	for (unsigned i = 0; i < sizeof(vars->shared_key); i++) {
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2210	out_shared_key[i] =
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2211	constant_time_select_8(ok, vars->shared_key[i], out_shared_key[i]);
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2212	}
Adam Langley	7153013	2021-06-30 16:33:47 -0700	[diff] [blame]	2213
				2214	out:
				2215	OPENSSL_free(malloc_ptr);
				2216	return 1;
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2217	}
				2218
				2219	void HRSS_marshal_public_key(uint8_t out[HRSS_PUBLIC_KEY_BYTES],
				2220	const struct HRSS_public_key *in_pub) {
				2221	const struct public_key *pub =
				2222	public_key_from_external((struct HRSS_public_key *)in_pub);
				2223	poly_marshal(out, &pub->ph);
				2224	}
				2225
				2226	int HRSS_parse_public_key(struct HRSS_public_key *out,
				2227	const uint8_t in[HRSS_PUBLIC_KEY_BYTES]) {
				2228	struct public_key *pub = public_key_from_external(out);
Adam Langley	f8068ce	2018-12-17 11:16:23 -0800	[diff] [blame]	2229	if (!poly_unmarshal(&pub->ph, in)) {
				2230	return 0;
				2231	}
Adam Langley	7b93593	2018-11-12 13:53:42 -0800	[diff] [blame]	2232	OPENSSL_memset(&pub->ph.v[N], 0, 3 * sizeof(uint16_t));
				2233	return 1;
				2234	}