Blame - crypto/ec/p256-64.c - boringssl.git

blob: a1d278e1e9454c853094e26bc270d50c952fb35d [file] [log] [blame]

Adam Langley	ad6b28e	2015-04-14 12:07:44 -0700	[diff] [blame^]	1	/* Copyright (c) 2015, Google Inc.
				2	*
				3	* Permission to use, copy, modify, and/or distribute this software for any
				4	* purpose with or without fee is hereby granted, provided that the above
				5	* copyright notice and this permission notice appear in all copies.
				6	*
				7	* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
				8	* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
				9	* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
				10	* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				11	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
				12	* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
				13	* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
				14
				15	/* A 64-bit implementation of the NIST P-256 elliptic curve point
				16	* multiplication
				17	*
				18	* OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
				19	* Otherwise based on Emilia's P224 work, which was inspired by my curve25519
				20	* work which got its smarts from Daniel J. Bernstein's work on the same. */
				21
				22	#include <openssl/base.h>
				23
				24	#if defined(OPENSSL_64_BIT) && !defined(OPENSSL_WINDOWS)
				25
				26	#include <openssl/bn.h>
				27	#include <openssl/ec.h>
				28	#include <openssl/err.h>
				29	#include <openssl/mem.h>
				30	#include <openssl/obj.h>
				31
				32	#include <string.h>
				33
				34	#include "internal.h"
				35
				36
				37	typedef uint8_t u8;
				38	typedef uint64_t u64;
				39	typedef int64_t s64;
				40	typedef __uint128_t uint128_t;
				41	typedef __int128_t int128_t;
				42
				43	/* The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
				44	* can serialise an element of this field into 32 bytes. We call this an
				45	* felem_bytearray. */
				46	typedef u8 felem_bytearray[32];
				47
				48	/* These are the parameters of P256, taken from FIPS 186-3, page 86. These
				49	* values are big-endian. */
				50	static const felem_bytearray nistp256_curve_params[5] = {
				51	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
				52	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
				53	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
				54	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
				55	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
				56	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				57	0xfc}, /* b */
				58	{0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, 0xb3, 0xeb, 0xbd, 0x55,
				59	0x76, 0x98, 0x86, 0xbc, 0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
				60	0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
				61	{0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
				62	0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2, 0x77, 0x03, 0x7d, 0x81,
				63	0x2d, 0xeb, 0x33, 0xa0, 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
				64	{0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
				65	0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16, 0x2b, 0xce, 0x33, 0x57,
				66	0x6b, 0x31, 0x5e, 0xce, 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}};
				67
/* The representation of field elements.
 * ------------------------------------
 *
 * We represent field elements with either four 128-bit values, eight 128-bit
 * values, or four 64-bit values. The field element represented is:
 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
 * or:
 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[7]*2^448  (mod p)
 *
 * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
 * apart, but are 128-bits wide, the most significant bits of each limb overlap
 * with the least significant bits of the next.
 *
 * A field element with four limbs is an 'felem'. One with eight limbs is a
 * 'longfelem'.
 *
 * A field element with four, 64-bit values is called a 'smallfelem'. Small
 * values are used as intermediate values before multiplication. */
				86
				87	#define NLIMBS 4
				88
				89	typedef uint128_t limb;
				90	typedef limb felem[NLIMBS];
				91	typedef limb longfelem[NLIMBS * 2];
				92	typedef u64 smallfelem[NLIMBS];
				93
				94	/* This is the value of the prime as four 64-bit words, little-endian. */
				95	static const u64 kPrime[4] = {0xfffffffffffffffful, 0xffffffff, 0,
				96	0xffffffff00000001ul};
				97	static const u64 bottom63bits = 0x7ffffffffffffffful;
				98
				99	/* bin32_to_felem takes a little-endian byte array and converts it into felem
				100	* form. This assumes that the CPU is little-endian. */
				101	static void bin32_to_felem(felem out, const u8 in[32]) {
				102	out[0] = ((u64 )&in[0]);
				103	out[1] = ((u64 )&in[8]);
				104	out[2] = ((u64 )&in[16]);
				105	out[3] = ((u64 )&in[24]);
				106	}
				107
				108	/* smallfelem_to_bin32 takes a smallfelem and serialises into a little endian,
				109	* 32 byte array. This assumes that the CPU is little-endian. */
				110	static void smallfelem_to_bin32(u8 out[32], const smallfelem in) {
				111	((u64 )&out[0]) = in[0];
				112	((u64 )&out[8]) = in[1];
				113	((u64 )&out[16]) = in[2];
				114	((u64 )&out[24]) = in[3];
				115	}
				116
				117	/* To preserve endianness when using BN_bn2bin and BN_bin2bn. */
				118	static void flip_endian(u8 out, const u8 in, unsigned len) {
				119	unsigned i;
				120	for (i = 0; i < len; ++i) {
				121	out[i] = in[len - 1 - i];
				122	}
				123	}
				124
				125	/* BN_to_felem converts an OpenSSL BIGNUM into an felem. */
				126	static int BN_to_felem(felem out, const BIGNUM *bn) {
				127	if (BN_is_negative(bn)) {
				128	OPENSSL_PUT_ERROR(EC, BN_to_felem, EC_R_BIGNUM_OUT_OF_RANGE);
				129	return 0;
				130	}
				131
				132	felem_bytearray b_out;
				133	/* BN_bn2bin eats leading zeroes */
				134	memset(b_out, 0, sizeof(b_out));
				135	unsigned num_bytes = BN_num_bytes(bn);
				136	if (num_bytes > sizeof(b_out)) {
				137	OPENSSL_PUT_ERROR(EC, BN_to_felem, EC_R_BIGNUM_OUT_OF_RANGE);
				138	return 0;
				139	}
				140
				141	felem_bytearray b_in;
				142	num_bytes = BN_bn2bin(bn, b_in);
				143	flip_endian(b_out, b_in, num_bytes);
				144	bin32_to_felem(out, b_out);
				145	return 1;
				146	}
				147
				148	/* felem_to_BN converts an felem into an OpenSSL BIGNUM. */
				149	static BIGNUM smallfelem_to_BN(BIGNUM out, const smallfelem in) {
				150	felem_bytearray b_in, b_out;
				151	smallfelem_to_bin32(b_in, in);
				152	flip_endian(b_out, b_in, sizeof(b_out));
				153	return BN_bin2bn(b_out, sizeof(b_out), out);
				154	}
				155
				156	/* Field operations. */
				157
				158	static void smallfelem_one(smallfelem out) {
				159	out[0] = 1;
				160	out[1] = 0;
				161	out[2] = 0;
				162	out[3] = 0;
				163	}
				164
				165	static void smallfelem_assign(smallfelem out, const smallfelem in) {
				166	out[0] = in[0];
				167	out[1] = in[1];
				168	out[2] = in[2];
				169	out[3] = in[3];
				170	}
				171
				172	static void felem_assign(felem out, const felem in) {
				173	out[0] = in[0];
				174	out[1] = in[1];
				175	out[2] = in[2];
				176	out[3] = in[3];
				177	}
				178
				179	/* felem_sum sets out = out + in. */
				180	static void felem_sum(felem out, const felem in) {
				181	out[0] += in[0];
				182	out[1] += in[1];
				183	out[2] += in[2];
				184	out[3] += in[3];
				185	}
				186
				187	/* felem_small_sum sets out = out + in. */
				188	static void felem_small_sum(felem out, const smallfelem in) {
				189	out[0] += in[0];
				190	out[1] += in[1];
				191	out[2] += in[2];
				192	out[3] += in[3];
				193	}
				194
				195	/* felem_scalar sets out = out * scalar */
				196	static void felem_scalar(felem out, const u64 scalar) {
				197	out[0] *= scalar;
				198	out[1] *= scalar;
				199	out[2] *= scalar;
				200	out[3] *= scalar;
				201	}
				202
				203	/* longfelem_scalar sets out = out * scalar */
				204	static void longfelem_scalar(longfelem out, const u64 scalar) {
				205	out[0] *= scalar;
				206	out[1] *= scalar;
				207	out[2] *= scalar;
				208	out[3] *= scalar;
				209	out[4] *= scalar;
				210	out[5] *= scalar;
				211	out[6] *= scalar;
				212	out[7] *= scalar;
				213	}
				214
				215	#define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
				216	#define two105 (((limb)1) << 105)
				217	#define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
				218
				219	/* zero105 is 0 mod p */
				220	static const felem zero105 = {two105m41m9, two105, two105m41p9, two105m41p9};
				221
				222	/* smallfelem_neg sets \|out\| to \|-small\|
				223	* On exit:
				224	* out[i] < out[i] + 2^105 */
				225	static void smallfelem_neg(felem out, const smallfelem small) {
				226	/* In order to prevent underflow, we subtract from 0 mod p. */
				227	out[0] = zero105[0] - small[0];
				228	out[1] = zero105[1] - small[1];
				229	out[2] = zero105[2] - small[2];
				230	out[3] = zero105[3] - small[3];
				231	}
				232
				233	/* felem_diff subtracts \|in\| from \|out\|
				234	* On entry:
				235	* in[i] < 2^104
				236	* On exit:
				237	* out[i] < out[i] + 2^105. */
				238	static void felem_diff(felem out, const felem in) {
				239	/* In order to prevent underflow, we add 0 mod p before subtracting. */
				240	out[0] += zero105[0];
				241	out[1] += zero105[1];
				242	out[2] += zero105[2];
				243	out[3] += zero105[3];
				244
				245	out[0] -= in[0];
				246	out[1] -= in[1];
				247	out[2] -= in[2];
				248	out[3] -= in[3];
				249	}
				250
				251	#define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
				252	#define two107 (((limb)1) << 107)
				253	#define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
				254
				255	/* zero107 is 0 mod p */
				256	static const felem zero107 = {two107m43m11, two107, two107m43p11, two107m43p11};
				257
				258	/* An alternative felem_diff for larger inputs \|in\|
				259	* felem_diff_zero107 subtracts \|in\| from \|out\|
				260	* On entry:
				261	* in[i] < 2^106
				262	* On exit:
				263	* out[i] < out[i] + 2^107. */
				264	static void felem_diff_zero107(felem out, const felem in) {
				265	/* In order to prevent underflow, we add 0 mod p before subtracting. */
				266	out[0] += zero107[0];
				267	out[1] += zero107[1];
				268	out[2] += zero107[2];
				269	out[3] += zero107[3];
				270
				271	out[0] -= in[0];
				272	out[1] -= in[1];
				273	out[2] -= in[2];
				274	out[3] -= in[3];
				275	}
				276
				277	/* longfelem_diff subtracts \|in\| from \|out\|
				278	* On entry:
				279	* in[i] < 7*2^67
				280	* On exit:
				281	* out[i] < out[i] + 2^70 + 2^40. */
				282	static void longfelem_diff(longfelem out, const longfelem in) {
				283	static const limb two70m8p6 =
				284	(((limb)1) << 70) - (((limb)1) << 8) + (((limb)1) << 6);
				285	static const limb two70p40 = (((limb)1) << 70) + (((limb)1) << 40);
				286	static const limb two70 = (((limb)1) << 70);
				287	static const limb two70m40m38p6 = (((limb)1) << 70) - (((limb)1) << 40) -
				288	(((limb)1) << 38) + (((limb)1) << 6);
				289	static const limb two70m6 = (((limb)1) << 70) - (((limb)1) << 6);
				290
				291	/* add 0 mod p to avoid underflow */
				292	out[0] += two70m8p6;
				293	out[1] += two70p40;
				294	out[2] += two70;
				295	out[3] += two70m40m38p6;
				296	out[4] += two70m6;
				297	out[5] += two70m6;
				298	out[6] += two70m6;
				299	out[7] += two70m6;
				300
				301	/* in[i] < 72^67 < 2^70 - 2^40 - 2^38 + 2^6 /
				302	out[0] -= in[0];
				303	out[1] -= in[1];
				304	out[2] -= in[2];
				305	out[3] -= in[3];
				306	out[4] -= in[4];
				307	out[5] -= in[5];
				308	out[6] -= in[6];
				309	out[7] -= in[7];
				310	}
				311
				312	#define two64m0 (((limb)1) << 64) - 1
				313	#define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
				314	#define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
				315	#define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
				316
				317	/* zero110 is 0 mod p. */
				318	static const felem zero110 = {two64m0, two110p32m0, two64m46, two64m32};
				319
				320	/* felem_shrink converts an felem into a smallfelem. The result isn't quite
				321	* minimal as the value may be greater than p.
				322	*
				323	* On entry:
				324	* in[i] < 2^109
				325	* On exit:
				326	* out[i] < 2^64. */
				327	static void felem_shrink(smallfelem out, const felem in) {
				328	felem tmp;
				329	u64 a, b, mask;
				330	s64 high, low;
				331	static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
				332
				333	/* Carry 2->3 */
				334	tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
				335	/* tmp[3] < 2^110 */
				336
				337	tmp[2] = zero110[2] + (u64)in[2];
				338	tmp[0] = zero110[0] + in[0];
				339	tmp[1] = zero110[1] + in[1];
				340	/* tmp[0] < 2110, tmp[1] < 2^111, tmp[2] < 265 */
				341
				342	/* We perform two partial reductions where we eliminate the high-word of
				343	* tmp[3]. We don't update the other words till the end. */
				344	a = tmp[3] >> 64; /* a < 2^46 */
				345	tmp[3] = (u64)tmp[3];
				346	tmp[3] -= a;
				347	tmp[3] += ((limb)a) << 32;
				348	/* tmp[3] < 2^79 */
				349
				350	b = a;
				351	a = tmp[3] >> 64; /* a < 2^15 */
				352	b += a; /* b < 2^46 + 2^15 < 2^47 */
				353	tmp[3] = (u64)tmp[3];
				354	tmp[3] -= a;
				355	tmp[3] += ((limb)a) << 32;
				356	/* tmp[3] < 2^64 + 2^47 */
				357
				358	/* This adjusts the other two words to complete the two partial
				359	* reductions. */
				360	tmp[0] += b;
				361	tmp[1] -= (((limb)b) << 32);
				362
				363	/* In order to make space in tmp[3] for the carry from 2 -> 3, we
				364	* conditionally subtract kPrime if tmp[3] is large enough. */
				365	high = tmp[3] >> 64;
				366	/* As tmp[3] < 2^65, high is either 1 or 0 */
				367	high <<= 63;
				368	high >>= 63;
				369	/* high is:
				370	* all ones if the high word of tmp[3] is 1
				371	* all zeros if the high word of tmp[3] if 0 */
				372	low = tmp[3];
				373	mask = low >> 63;
				374	/* mask is:
				375	* all ones if the MSB of low is 1
				376	* all zeros if the MSB of low if 0 */
				377	low &= bottom63bits;
				378	low -= kPrime3Test;
				379	/* if low was greater than kPrime3Test then the MSB is zero */
				380	low = ~low;
				381	low >>= 63;
				382	/* low is:
				383	* all ones if low was > kPrime3Test
				384	* all zeros if low was <= kPrime3Test */
				385	mask = (mask & low) \| high;
				386	tmp[0] -= mask & kPrime[0];
				387	tmp[1] -= mask & kPrime[1];
				388	/* kPrime[2] is zero, so omitted */
				389	tmp[3] -= mask & kPrime[3];
				390	/* tmp[3] < 264 - 232 + 1 */
				391
				392	tmp[1] += ((u64)(tmp[0] >> 64));
				393	tmp[0] = (u64)tmp[0];
				394	tmp[2] += ((u64)(tmp[1] >> 64));
				395	tmp[1] = (u64)tmp[1];
				396	tmp[3] += ((u64)(tmp[2] >> 64));
				397	tmp[2] = (u64)tmp[2];
				398	/* tmp[i] < 2^64 */
				399
				400	out[0] = tmp[0];
				401	out[1] = tmp[1];
				402	out[2] = tmp[2];
				403	out[3] = tmp[3];
				404	}
				405
				406	/* smallfelem_expand converts a smallfelem to an felem */
				407	static void smallfelem_expand(felem out, const smallfelem in) {
				408	out[0] = in[0];
				409	out[1] = in[1];
				410	out[2] = in[2];
				411	out[3] = in[3];
				412	}
				413
				414	/* smallfelem_square sets \|out\| = \|small\|^2
				415	* On entry:
				416	* small[i] < 2^64
				417	* On exit:
				418	* out[i] < 7 * 2^64 < 2^67 */
				419	static void smallfelem_square(longfelem out, const smallfelem small) {
				420	limb a;
				421	u64 high, low;
				422
				423	a = ((uint128_t)small[0]) * small[0];
				424	low = a;
				425	high = a >> 64;
				426	out[0] = low;
				427	out[1] = high;
				428
				429	a = ((uint128_t)small[0]) * small[1];
				430	low = a;
				431	high = a >> 64;
				432	out[1] += low;
				433	out[1] += low;
				434	out[2] = high;
				435
				436	a = ((uint128_t)small[0]) * small[2];
				437	low = a;
				438	high = a >> 64;
				439	out[2] += low;
				440	out[2] *= 2;
				441	out[3] = high;
				442
				443	a = ((uint128_t)small[0]) * small[3];
				444	low = a;
				445	high = a >> 64;
				446	out[3] += low;
				447	out[4] = high;
				448
				449	a = ((uint128_t)small[1]) * small[2];
				450	low = a;
				451	high = a >> 64;
				452	out[3] += low;
				453	out[3] *= 2;
				454	out[4] += high;
				455
				456	a = ((uint128_t)small[1]) * small[1];
				457	low = a;
				458	high = a >> 64;
				459	out[2] += low;
				460	out[3] += high;
				461
				462	a = ((uint128_t)small[1]) * small[3];
				463	low = a;
				464	high = a >> 64;
				465	out[4] += low;
				466	out[4] *= 2;
				467	out[5] = high;
				468
				469	a = ((uint128_t)small[2]) * small[3];
				470	low = a;
				471	high = a >> 64;
				472	out[5] += low;
				473	out[5] *= 2;
				474	out[6] = high;
				475	out[6] += high;
				476
				477	a = ((uint128_t)small[2]) * small[2];
				478	low = a;
				479	high = a >> 64;
				480	out[4] += low;
				481	out[5] += high;
				482
				483	a = ((uint128_t)small[3]) * small[3];
				484	low = a;
				485	high = a >> 64;
				486	out[6] += low;
				487	out[7] = high;
				488	}
				489
				490	/*felem_square sets \|out\| = \|in\|^2
				491	* On entry:
				492	* in[i] < 2^109
				493	* On exit:
				494	* out[i] < 7 * 2^64 < 2^67. */
				495	static void felem_square(longfelem out, const felem in) {
				496	u64 small[4];
				497	felem_shrink(small, in);
				498	smallfelem_square(out, small);
				499	}
				500
				501	/* smallfelem_mul sets \|out\| = \|small1\| * \|small2\|
				502	* On entry:
				503	* small1[i] < 2^64
				504	* small2[i] < 2^64
				505	* On exit:
				506	* out[i] < 7 * 2^64 < 2^67. */
				507	static void smallfelem_mul(longfelem out, const smallfelem small1,
				508	const smallfelem small2) {
				509	limb a;
				510	u64 high, low;
				511
				512	a = ((uint128_t)small1[0]) * small2[0];
				513	low = a;
				514	high = a >> 64;
				515	out[0] = low;
				516	out[1] = high;
				517
				518	a = ((uint128_t)small1[0]) * small2[1];
				519	low = a;
				520	high = a >> 64;
				521	out[1] += low;
				522	out[2] = high;
				523
				524	a = ((uint128_t)small1[1]) * small2[0];
				525	low = a;
				526	high = a >> 64;
				527	out[1] += low;
				528	out[2] += high;
				529
				530	a = ((uint128_t)small1[0]) * small2[2];
				531	low = a;
				532	high = a >> 64;
				533	out[2] += low;
				534	out[3] = high;
				535
				536	a = ((uint128_t)small1[1]) * small2[1];
				537	low = a;
				538	high = a >> 64;
				539	out[2] += low;
				540	out[3] += high;
				541
				542	a = ((uint128_t)small1[2]) * small2[0];
				543	low = a;
				544	high = a >> 64;
				545	out[2] += low;
				546	out[3] += high;
				547
				548	a = ((uint128_t)small1[0]) * small2[3];
				549	low = a;
				550	high = a >> 64;
				551	out[3] += low;
				552	out[4] = high;
				553
				554	a = ((uint128_t)small1[1]) * small2[2];
				555	low = a;
				556	high = a >> 64;
				557	out[3] += low;
				558	out[4] += high;
				559
				560	a = ((uint128_t)small1[2]) * small2[1];
				561	low = a;
				562	high = a >> 64;
				563	out[3] += low;
				564	out[4] += high;
				565
				566	a = ((uint128_t)small1[3]) * small2[0];
				567	low = a;
				568	high = a >> 64;
				569	out[3] += low;
				570	out[4] += high;
				571
				572	a = ((uint128_t)small1[1]) * small2[3];
				573	low = a;
				574	high = a >> 64;
				575	out[4] += low;
				576	out[5] = high;
				577
				578	a = ((uint128_t)small1[2]) * small2[2];
				579	low = a;
				580	high = a >> 64;
				581	out[4] += low;
				582	out[5] += high;
				583
				584	a = ((uint128_t)small1[3]) * small2[1];
				585	low = a;
				586	high = a >> 64;
				587	out[4] += low;
				588	out[5] += high;
				589
				590	a = ((uint128_t)small1[2]) * small2[3];
				591	low = a;
				592	high = a >> 64;
				593	out[5] += low;
				594	out[6] = high;
				595
				596	a = ((uint128_t)small1[3]) * small2[2];
				597	low = a;
				598	high = a >> 64;
				599	out[5] += low;
				600	out[6] += high;
				601
				602	a = ((uint128_t)small1[3]) * small2[3];
				603	low = a;
				604	high = a >> 64;
				605	out[6] += low;
				606	out[7] = high;
				607	}
				608
				609	/* felem_mul sets \|out\| = \|in1\| * \|in2\|
				610	* On entry:
				611	* in1[i] < 2^109
				612	* in2[i] < 2^109
				613	* On exit:
				614	* out[i] < 7 * 2^64 < 2^67 */
				615	static void felem_mul(longfelem out, const felem in1, const felem in2) {
				616	smallfelem small1, small2;
				617	felem_shrink(small1, in1);
				618	felem_shrink(small2, in2);
				619	smallfelem_mul(out, small1, small2);
				620	}
				621
				622	/* felem_small_mul sets \|out\| = \|small1\| * \|in2\|
				623	* On entry:
				624	* small1[i] < 2^64
				625	* in2[i] < 2^109
				626	* On exit:
				627	* out[i] < 7 * 2^64 < 2^67 */
				628	static void felem_small_mul(longfelem out, const smallfelem small1,
				629	const felem in2) {
				630	smallfelem small2;
				631	felem_shrink(small2, in2);
				632	smallfelem_mul(out, small1, small2);
				633	}
				634
				635	#define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
				636	#define two100 (((limb)1) << 100)
				637	#define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
				638
				639	/* zero100 is 0 mod p */
				640	static const felem zero100 = {two100m36m4, two100, two100m36p4, two100m36p4};
				641
				642	/* Internal function for the different flavours of felem_reduce.
				643	* felem_reduce_ reduces the higher coefficients in[4]-in[7].
				644	* On entry:
				645	* out[0] >= in[6] + 2^32in[6] + in[7] + 2^32in[7]
				646	* out[1] >= in[7] + 2^32*in[4]
				647	* out[2] >= in[5] + 2^32*in[5]
				648	* out[3] >= in[4] + 2^32in[5] + 2^32in[6]
				649	* On exit:
				650	* out[0] <= out[0] + in[4] + 2^32*in[5]
				651	* out[1] <= out[1] + in[5] + 2^33*in[6]
				652	* out[2] <= out[2] + in[7] + 2in[6] + 2^33in[7]
				653	* out[3] <= out[3] + 2^32in[4] + 3in[7] */
				654	static void felem_reduce_(felem out, const longfelem in) {
				655	int128_t c;
				656	/* combine common terms from below */
				657	c = in[4] + (in[5] << 32);
				658	out[0] += c;
				659	out[3] -= c;
				660
				661	c = in[5] - in[7];
				662	out[1] += c;
				663	out[2] -= c;
				664
				665	/* the remaining terms */
				666	/* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
				667	out[1] -= (in[4] << 32);
				668	out[3] += (in[4] << 32);
				669
				670	/* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
				671	out[2] -= (in[5] << 32);
				672
				673	/* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
				674	out[0] -= in[6];
				675	out[0] -= (in[6] << 32);
				676	out[1] += (in[6] << 33);
				677	out[2] += (in[6] * 2);
				678	out[3] -= (in[6] << 32);
				679
				680	/* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
				681	out[0] -= in[7];
				682	out[0] -= (in[7] << 32);
				683	out[2] += (in[7] << 33);
				684	out[3] += (in[7] * 3);
				685	}
				686
				687	/* felem_reduce converts a longfelem into an felem.
				688	* To be called directly after felem_square or felem_mul.
				689	* On entry:
				690	* in[0] < 2^64, in[1] < 32^64, in[2] < 52^64, in[3] < 7*2^64
				691	* in[4] < 72^64, in[5] < 52^64, in[6] < 32^64, in[7] < 264
				692	* On exit:
				693	* out[i] < 2^101 */
				694	static void felem_reduce(felem out, const longfelem in) {
				695	out[0] = zero100[0] + in[0];
				696	out[1] = zero100[1] + in[1];
				697	out[2] = zero100[2] + in[2];
				698	out[3] = zero100[3] + in[3];
				699
				700	felem_reduce_(out, in);
				701
				702	/* out[0] > 2^100 - 2^36 - 2^4 - 32^64 - 32^96 - 2^64 - 2^96 > 0
				703	* out[1] > 2^100 - 2^64 - 7*2^96 > 0
				704	* out[2] > 2^100 - 2^36 + 2^4 - 52^64 - 52^96 > 0
				705	* out[3] > 2^100 - 2^36 + 2^4 - 72^64 - 52^96 - 3*2^96 > 0
				706	*
				707	* out[0] < 2^100 + 2^64 + 72^64 + 52^96 < 2^101
				708	* out[1] < 2^100 + 32^64 + 52^64 + 3*2^97 < 2^101
				709	* out[2] < 2^100 + 52^64 + 2^64 + 32^65 + 2^97 < 2^101
				710	* out[3] < 2^100 + 72^64 + 72^96 + 32^64 < 2^101 /
				711	}
				712
				713	/* felem_reduce_zero105 converts a larger longfelem into an felem.
				714	* On entry:
				715	* in[0] < 2^71
				716	* On exit:
				717	* out[i] < 2^106 */
				718	static void felem_reduce_zero105(felem out, const longfelem in) {
				719	out[0] = zero105[0] + in[0];
				720	out[1] = zero105[1] + in[1];
				721	out[2] = zero105[2] + in[2];
				722	out[3] = zero105[3] + in[3];
				723
				724	felem_reduce_(out, in);
				725
				726	/* out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
				727	* out[1] > 2^105 - 2^71 - 2^103 > 0
				728	* out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
				729	* out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
				730	*
				731	* out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
				732	* out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
				733	* out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
				734	* out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106 */
				735	}
				736
				737	/* subtract_u64 sets result = result - v and *carry to one if the
				738	* subtraction underflowed. */
				739	static void subtract_u64(u64 result, u64 carry, u64 v) {
				740	uint128_t r = *result;
				741	r -= v;
				742	*carry = (r >> 64) & 1;
				743	*result = (u64)r;
				744	}
				745
				746	/* felem_contract converts \|in\| to its unique, minimal representation. On
				747	* entry: in[i] < 2^109. */
				748	static void felem_contract(smallfelem out, const felem in) {
				749	u64 all_equal_so_far = 0, result = 0;
				750
				751	felem_shrink(out, in);
				752	/* small is minimal except that the value might be > p */
				753
				754	all_equal_so_far--;
				755	/* We are doing a constant time test if out >= kPrime. We need to compare
				756	* each u64, from most-significant to least significant. For each one, if
				757	* all words so far have been equal (m is all ones) then a non-equal
				758	* result is the answer. Otherwise we continue. */
				759	unsigned i;
				760	for (i = 3; i < 4; i--) {
				761	u64 equal;
				762	uint128_t a = ((uint128_t)kPrime[i]) - out[i];
				763	/* if out[i] > kPrime[i] then a will underflow and the high 64-bits
				764	* will all be set. */
				765	result \|= all_equal_so_far & ((u64)(a >> 64));
				766
				767	/* if kPrime[i] == out[i] then \|equal\| will be all zeros and the
				768	* decrement will make it all ones. */
				769	equal = kPrime[i] ^ out[i];
				770	equal--;
				771	equal &= equal << 32;
				772	equal &= equal << 16;
				773	equal &= equal << 8;
				774	equal &= equal << 4;
				775	equal &= equal << 2;
				776	equal &= equal << 1;
				777	equal = ((s64)equal) >> 63;
				778
				779	all_equal_so_far &= equal;
				780	}
				781
				782	/* if all_equal_so_far is still all ones then the two values are equal
				783	* and so out >= kPrime is true. */
				784	result \|= all_equal_so_far;
				785
				786	/* if out >= kPrime then we subtract kPrime. */
				787	u64 carry;
				788	subtract_u64(&out[0], &carry, result & kPrime[0]);
				789	subtract_u64(&out[1], &carry, carry);
				790	subtract_u64(&out[2], &carry, carry);
				791	subtract_u64(&out[3], &carry, carry);
				792
				793	subtract_u64(&out[1], &carry, result & kPrime[1]);
				794	subtract_u64(&out[2], &carry, carry);
				795	subtract_u64(&out[3], &carry, carry);
				796
				797	subtract_u64(&out[2], &carry, result & kPrime[2]);
				798	subtract_u64(&out[3], &carry, carry);
				799
				800	subtract_u64(&out[3], &carry, result & kPrime[3]);
				801	}
				802
				803	static void smallfelem_square_contract(smallfelem out, const smallfelem in) {
				804	longfelem longtmp;
				805	felem tmp;
				806
				807	smallfelem_square(longtmp, in);
				808	felem_reduce(tmp, longtmp);
				809	felem_contract(out, tmp);
				810	}
				811
				812	static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
				813	const smallfelem in2) {
				814	longfelem longtmp;
				815	felem tmp;
				816
				817	smallfelem_mul(longtmp, in1, in2);
				818	felem_reduce(tmp, longtmp);
				819	felem_contract(out, tmp);
				820	}
				821
				822	/* felem_is_zero returns a limb with all bits set if \|in\| == 0 (mod p) and 0
				823	* otherwise.
				824	* On entry:
				825	* small[i] < 2^64 */
				826	static limb smallfelem_is_zero(const smallfelem small) {
				827	limb result;
				828	u64 is_p;
				829
				830	u64 is_zero = small[0] \| small[1] \| small[2] \| small[3];
				831	is_zero--;
				832	is_zero &= is_zero << 32;
				833	is_zero &= is_zero << 16;
				834	is_zero &= is_zero << 8;
				835	is_zero &= is_zero << 4;
				836	is_zero &= is_zero << 2;
				837	is_zero &= is_zero << 1;
				838	is_zero = ((s64)is_zero) >> 63;
				839
				840	is_p = (small[0] ^ kPrime[0]) \| (small[1] ^ kPrime[1]) \|
				841	(small[2] ^ kPrime[2]) \| (small[3] ^ kPrime[3]);
				842	is_p--;
				843	is_p &= is_p << 32;
				844	is_p &= is_p << 16;
				845	is_p &= is_p << 8;
				846	is_p &= is_p << 4;
				847	is_p &= is_p << 2;
				848	is_p &= is_p << 1;
				849	is_p = ((s64)is_p) >> 63;
				850
				851	is_zero \|= is_p;
				852
				853	result = is_zero;
				854	result \|= ((limb)is_zero) << 64;
				855	return result;
				856	}
				857
				858	static int smallfelem_is_zero_int(const smallfelem small) {
				859	return (int)(smallfelem_is_zero(small) & ((limb)1));
				860	}
				861
				862	/* felem_inv calculates \|out\| = \|in\|^{-1}
				863	*
				864	* Based on Fermat's Little Theorem:
				865	* a^p = a (mod p)
				866	* a^{p-1} = 1 (mod p)
				867	* a^{p-2} = a^{-1} (mod p) */
				868	static void felem_inv(felem out, const felem in) {
				869	felem ftmp, ftmp2;
				870	/* each e_I will hold \|in\|^{2^I - 1} */
				871	felem e2, e4, e8, e16, e32, e64;
				872	longfelem tmp;
				873	unsigned i;
				874
				875	felem_square(tmp, in);
				876	felem_reduce(ftmp, tmp); /* 2^1 */
				877	felem_mul(tmp, in, ftmp);
				878	felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
				879	felem_assign(e2, ftmp);
				880	felem_square(tmp, ftmp);
				881	felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
				882	felem_square(tmp, ftmp);
				883	felem_reduce(ftmp, tmp); /* 2^4 - 2^2 */
				884	felem_mul(tmp, ftmp, e2);
				885	felem_reduce(ftmp, tmp); /* 2^4 - 2^0 */
				886	felem_assign(e4, ftmp);
				887	felem_square(tmp, ftmp);
				888	felem_reduce(ftmp, tmp); /* 2^5 - 2^1 */
				889	felem_square(tmp, ftmp);
				890	felem_reduce(ftmp, tmp); /* 2^6 - 2^2 */
				891	felem_square(tmp, ftmp);
				892	felem_reduce(ftmp, tmp); /* 2^7 - 2^3 */
				893	felem_square(tmp, ftmp);
				894	felem_reduce(ftmp, tmp); /* 2^8 - 2^4 */
				895	felem_mul(tmp, ftmp, e4);
				896	felem_reduce(ftmp, tmp); /* 2^8 - 2^0 */
				897	felem_assign(e8, ftmp);
				898	for (i = 0; i < 8; i++) {
				899	felem_square(tmp, ftmp);
				900	felem_reduce(ftmp, tmp);
				901	} /* 2^16 - 2^8 */
				902	felem_mul(tmp, ftmp, e8);
				903	felem_reduce(ftmp, tmp); /* 2^16 - 2^0 */
				904	felem_assign(e16, ftmp);
				905	for (i = 0; i < 16; i++) {
				906	felem_square(tmp, ftmp);
				907	felem_reduce(ftmp, tmp);
				908	} /* 2^32 - 2^16 */
				909	felem_mul(tmp, ftmp, e16);
				910	felem_reduce(ftmp, tmp); /* 2^32 - 2^0 */
				911	felem_assign(e32, ftmp);
				912	for (i = 0; i < 32; i++) {
				913	felem_square(tmp, ftmp);
				914	felem_reduce(ftmp, tmp);
				915	} /* 2^64 - 2^32 */
				916	felem_assign(e64, ftmp);
				917	felem_mul(tmp, ftmp, in);
				918	felem_reduce(ftmp, tmp); /* 2^64 - 2^32 + 2^0 */
				919	for (i = 0; i < 192; i++) {
				920	felem_square(tmp, ftmp);
				921	felem_reduce(ftmp, tmp);
				922	} /* 2^256 - 2^224 + 2^192 */
				923
				924	felem_mul(tmp, e64, e32);
				925	felem_reduce(ftmp2, tmp); /* 2^64 - 2^0 */
				926	for (i = 0; i < 16; i++) {
				927	felem_square(tmp, ftmp2);
				928	felem_reduce(ftmp2, tmp);
				929	} /* 2^80 - 2^16 */
				930	felem_mul(tmp, ftmp2, e16);
				931	felem_reduce(ftmp2, tmp); /* 2^80 - 2^0 */
				932	for (i = 0; i < 8; i++) {
				933	felem_square(tmp, ftmp2);
				934	felem_reduce(ftmp2, tmp);
				935	} /* 2^88 - 2^8 */
				936	felem_mul(tmp, ftmp2, e8);
				937	felem_reduce(ftmp2, tmp); /* 2^88 - 2^0 */
				938	for (i = 0; i < 4; i++) {
				939	felem_square(tmp, ftmp2);
				940	felem_reduce(ftmp2, tmp);
				941	} /* 2^92 - 2^4 */
				942	felem_mul(tmp, ftmp2, e4);
				943	felem_reduce(ftmp2, tmp); /* 2^92 - 2^0 */
				944	felem_square(tmp, ftmp2);
				945	felem_reduce(ftmp2, tmp); /* 2^93 - 2^1 */
				946	felem_square(tmp, ftmp2);
				947	felem_reduce(ftmp2, tmp); /* 2^94 - 2^2 */
				948	felem_mul(tmp, ftmp2, e2);
				949	felem_reduce(ftmp2, tmp); /* 2^94 - 2^0 */
				950	felem_square(tmp, ftmp2);
				951	felem_reduce(ftmp2, tmp); /* 2^95 - 2^1 */
				952	felem_square(tmp, ftmp2);
				953	felem_reduce(ftmp2, tmp); /* 2^96 - 2^2 */
				954	felem_mul(tmp, ftmp2, in);
				955	felem_reduce(ftmp2, tmp); /* 2^96 - 3 */
				956
				957	felem_mul(tmp, ftmp2, ftmp);
				958	felem_reduce(out, tmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
				959	}
				960
				961	static void smallfelem_inv_contract(smallfelem out, const smallfelem in) {
				962	felem tmp;
				963
				964	smallfelem_expand(tmp, in);
				965	felem_inv(tmp, tmp);
				966	felem_contract(out, tmp);
				967	}
				968
				969	/* Group operations
				970	* ----------------
				971	*
				972	* Building on top of the field operations we have the operations on the
				973	* elliptic curve group itself. Points on the curve are represented in Jacobian
				974	* coordinates. */
				975
				976	/* point_double calculates 2*(x_in, y_in, z_in)
				977	*
				978	* The method is taken from:
				979	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
				980	*
				981	* Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
				982	* while x_out == y_in is not (maybe this works, but it's not tested). */
				983	static void point_double(felem x_out, felem y_out, felem z_out,
				984	const felem x_in, const felem y_in, const felem z_in) {
				985	longfelem tmp, tmp2;
				986	felem delta, gamma, beta, alpha, ftmp, ftmp2;
				987	smallfelem small1, small2;
				988
				989	felem_assign(ftmp, x_in);
				990	/* ftmp[i] < 2^106 */
				991	felem_assign(ftmp2, x_in);
				992	/* ftmp2[i] < 2^106 */
				993
				994	/* delta = z^2 */
				995	felem_square(tmp, z_in);
				996	felem_reduce(delta, tmp);
				997	/* delta[i] < 2^101 */
				998
				999	/* gamma = y^2 */
				1000	felem_square(tmp, y_in);
				1001	felem_reduce(gamma, tmp);
				1002	/* gamma[i] < 2^101 */
				1003	felem_shrink(small1, gamma);
				1004
				1005	/* beta = xgamma /
				1006	felem_small_mul(tmp, small1, x_in);
				1007	felem_reduce(beta, tmp);
				1008	/* beta[i] < 2^101 */
				1009
				1010	/* alpha = 3(x-delta)(x+delta) */
				1011	felem_diff(ftmp, delta);
				1012	/* ftmp[i] < 2^105 + 2^106 < 2^107 */
				1013	felem_sum(ftmp2, delta);
				1014	/* ftmp2[i] < 2^105 + 2^106 < 2^107 */
				1015	felem_scalar(ftmp2, 3);
				1016	/* ftmp2[i] < 3 * 2^107 < 2^109 */
				1017	felem_mul(tmp, ftmp, ftmp2);
				1018	felem_reduce(alpha, tmp);
				1019	/* alpha[i] < 2^101 */
				1020	felem_shrink(small2, alpha);
				1021
				1022	/* x' = alpha^2 - 8beta /
				1023	smallfelem_square(tmp, small2);
				1024	felem_reduce(x_out, tmp);
				1025	felem_assign(ftmp, beta);
				1026	felem_scalar(ftmp, 8);
				1027	/* ftmp[i] < 8 * 2^101 = 2^104 */
				1028	felem_diff(x_out, ftmp);
				1029	/* x_out[i] < 2^105 + 2^101 < 2^106 */
				1030
				1031	/* z' = (y + z)^2 - gamma - delta */
				1032	felem_sum(delta, gamma);
				1033	/* delta[i] < 2^101 + 2^101 = 2^102 */
				1034	felem_assign(ftmp, y_in);
				1035	felem_sum(ftmp, z_in);
				1036	/* ftmp[i] < 2^106 + 2^106 = 2^107 */
				1037	felem_square(tmp, ftmp);
				1038	felem_reduce(z_out, tmp);
				1039	felem_diff(z_out, delta);
				1040	/* z_out[i] < 2^105 + 2^101 < 2^106 */
				1041
				1042	/* y' = alpha(4beta - x') - 8gamma^2 /
				1043	felem_scalar(beta, 4);
				1044	/* beta[i] < 4 * 2^101 = 2^103 */
				1045	felem_diff_zero107(beta, x_out);
				1046	/* beta[i] < 2^107 + 2^103 < 2^108 */
				1047	felem_small_mul(tmp, small2, beta);
				1048	/* tmp[i] < 7 * 2^64 < 2^67 */
				1049	smallfelem_square(tmp2, small1);
				1050	/* tmp2[i] < 7 * 2^64 */
				1051	longfelem_scalar(tmp2, 8);
				1052	/* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
				1053	longfelem_diff(tmp, tmp2);
				1054	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
				1055	felem_reduce_zero105(y_out, tmp);
				1056	/* y_out[i] < 2^106 */
				1057	}
				1058
				1059	/* point_double_small is the same as point_double, except that it operates on
				1060	* smallfelems. */
				1061	static void point_double_small(smallfelem x_out, smallfelem y_out,
				1062	smallfelem z_out, const smallfelem x_in,
				1063	const smallfelem y_in, const smallfelem z_in) {
				1064	felem felem_x_out, felem_y_out, felem_z_out;
				1065	felem felem_x_in, felem_y_in, felem_z_in;
				1066
				1067	smallfelem_expand(felem_x_in, x_in);
				1068	smallfelem_expand(felem_y_in, y_in);
				1069	smallfelem_expand(felem_z_in, z_in);
				1070	point_double(felem_x_out, felem_y_out, felem_z_out, felem_x_in, felem_y_in,
				1071	felem_z_in);
				1072	felem_shrink(x_out, felem_x_out);
				1073	felem_shrink(y_out, felem_y_out);
				1074	felem_shrink(z_out, felem_z_out);
				1075	}
				1076
				1077	/* copy_conditional copies in to out iff mask is all ones. */
				1078	static void copy_conditional(felem out, const felem in, limb mask) {
				1079	unsigned i;
				1080	for (i = 0; i < NLIMBS; ++i) {
				1081	const limb tmp = mask & (in[i] ^ out[i]);
				1082	out[i] ^= tmp;
				1083	}
				1084	}
				1085
				1086	/* copy_small_conditional copies in to out iff mask is all ones. */
				1087	static void copy_small_conditional(felem out, const smallfelem in, limb mask) {
				1088	unsigned i;
				1089	const u64 mask64 = mask;
				1090	for (i = 0; i < NLIMBS; ++i) {
				1091	out[i] = ((limb)(in[i] & mask64)) \| (out[i] & ~mask);
				1092	}
				1093	}
				1094
				1095	/* point_add calcuates (x1, y1, z1) + (x2, y2, z2)
				1096	*
				1097	* The method is taken from:
				1098	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
				1099	* adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
				1100	*
				1101	* This function includes a branch for checking whether the two input points
				1102	* are equal, (while not equal to the point at infinity). This case never
				1103	* happens during single point multiplication, so there is no timing leak for
				1104	* ECDH or ECDSA signing. */
				1105	static void point_add(felem x3, felem y3, felem z3, const felem x1,
				1106	const felem y1, const felem z1, const int mixed,
				1107	const smallfelem x2, const smallfelem y2,
				1108	const smallfelem z2) {
				1109	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
				1110	longfelem tmp, tmp2;
				1111	smallfelem small1, small2, small3, small4, small5;
				1112	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
				1113
				1114	felem_shrink(small3, z1);
				1115
				1116	z1_is_zero = smallfelem_is_zero(small3);
				1117	z2_is_zero = smallfelem_is_zero(z2);
				1118
				1119	/* ftmp = z1z1 = z1*2 /
				1120	smallfelem_square(tmp, small3);
				1121	felem_reduce(ftmp, tmp);
				1122	/* ftmp[i] < 2^101 */
				1123	felem_shrink(small1, ftmp);
				1124
				1125	if (!mixed) {
				1126	/* ftmp2 = z2z2 = z2*2 /
				1127	smallfelem_square(tmp, z2);
				1128	felem_reduce(ftmp2, tmp);
				1129	/* ftmp2[i] < 2^101 */
				1130	felem_shrink(small2, ftmp2);
				1131
				1132	felem_shrink(small5, x1);
				1133
				1134	/* u1 = ftmp3 = x1z2z2 /
				1135	smallfelem_mul(tmp, small5, small2);
				1136	felem_reduce(ftmp3, tmp);
				1137	/* ftmp3[i] < 2^101 */
				1138
				1139	/* ftmp5 = z1 + z2 */
				1140	felem_assign(ftmp5, z1);
				1141	felem_small_sum(ftmp5, z2);
				1142	/* ftmp5[i] < 2^107 */
				1143
				1144	/* ftmp5 = (z1 + z2)*2 - (z1z1 + z2z2) = 2z1z2 /
				1145	felem_square(tmp, ftmp5);
				1146	felem_reduce(ftmp5, tmp);
				1147	/* ftmp2 = z2z2 + z1z1 */
				1148	felem_sum(ftmp2, ftmp);
				1149	/* ftmp2[i] < 2^101 + 2^101 = 2^102 */
				1150	felem_diff(ftmp5, ftmp2);
				1151	/* ftmp5[i] < 2^105 + 2^101 < 2^106 */
				1152
				1153	/* ftmp2 = z2 * z2z2 */
				1154	smallfelem_mul(tmp, small2, z2);
				1155	felem_reduce(ftmp2, tmp);
				1156
				1157	/* s1 = ftmp2 = y1 * z2*3 /
				1158	felem_mul(tmp, y1, ftmp2);
				1159	felem_reduce(ftmp6, tmp);
				1160	/* ftmp6[i] < 2^101 */
				1161	} else {
				1162	/* We'll assume z2 = 1 (special case z2 = 0 is handled later). */
				1163
				1164	/* u1 = ftmp3 = x1z2z2 /
				1165	felem_assign(ftmp3, x1);
				1166	/* ftmp3[i] < 2^106 */
				1167
				1168	/* ftmp5 = 2z1z2 */
				1169	felem_assign(ftmp5, z1);
				1170	felem_scalar(ftmp5, 2);
				1171	/* ftmp5[i] < 22^106 = 2^107 /
				1172
				1173	/* s1 = ftmp2 = y1 * z2*3 /
				1174	felem_assign(ftmp6, y1);
				1175	/* ftmp6[i] < 2^106 */
				1176	}
				1177
				1178	/* u2 = x2z1z1 /
				1179	smallfelem_mul(tmp, x2, small1);
				1180	felem_reduce(ftmp4, tmp);
				1181
				1182	/* h = ftmp4 = u2 - u1 */
				1183	felem_diff_zero107(ftmp4, ftmp3);
				1184	/* ftmp4[i] < 2^107 + 2^101 < 2^108 */
				1185	felem_shrink(small4, ftmp4);
				1186
				1187	x_equal = smallfelem_is_zero(small4);
				1188
				1189	/* z_out = ftmp5 * h */
				1190	felem_small_mul(tmp, small4, ftmp5);
				1191	felem_reduce(z_out, tmp);
				1192	/* z_out[i] < 2^101 */
				1193
				1194	/* ftmp = z1 * z1z1 */
				1195	smallfelem_mul(tmp, small1, small3);
				1196	felem_reduce(ftmp, tmp);
				1197
				1198	/* s2 = tmp = y2 * z1*3 /
				1199	felem_small_mul(tmp, y2, ftmp);
				1200	felem_reduce(ftmp5, tmp);
				1201
				1202	/* r = ftmp5 = (s2 - s1)2 /
				1203	felem_diff_zero107(ftmp5, ftmp6);
				1204	/* ftmp5[i] < 2^107 + 2^107 = 2^108 */
				1205	felem_scalar(ftmp5, 2);
				1206	/* ftmp5[i] < 2^109 */
				1207	felem_shrink(small1, ftmp5);
				1208	y_equal = smallfelem_is_zero(small1);
				1209
				1210	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
				1211	point_double(x3, y3, z3, x1, y1, z1);
				1212	return;
				1213	}
				1214
				1215	/* I = ftmp = (2h)*2 /
				1216	felem_assign(ftmp, ftmp4);
				1217	felem_scalar(ftmp, 2);
				1218	/* ftmp[i] < 22^108 = 2^109 /
				1219	felem_square(tmp, ftmp);
				1220	felem_reduce(ftmp, tmp);
				1221
				1222	/* J = ftmp2 = h * I */
				1223	felem_mul(tmp, ftmp4, ftmp);
				1224	felem_reduce(ftmp2, tmp);
				1225
				1226	/* V = ftmp4 = U1 * I */
				1227	felem_mul(tmp, ftmp3, ftmp);
				1228	felem_reduce(ftmp4, tmp);
				1229
				1230	/* x_out = r*2 - J - 2V /
				1231	smallfelem_square(tmp, small1);
				1232	felem_reduce(x_out, tmp);
				1233	felem_assign(ftmp3, ftmp4);
				1234	felem_scalar(ftmp4, 2);
				1235	felem_sum(ftmp4, ftmp2);
				1236	/* ftmp4[i] < 22^101 + 2^101 < 2^103 /
				1237	felem_diff(x_out, ftmp4);
				1238	/* x_out[i] < 2^105 + 2^101 */
				1239
				1240	/* y_out = r(V-x_out) - 2 * s1 * J */
				1241	felem_diff_zero107(ftmp3, x_out);
				1242	/* ftmp3[i] < 2^107 + 2^101 < 2^108 */
				1243	felem_small_mul(tmp, small1, ftmp3);
				1244	felem_mul(tmp2, ftmp6, ftmp2);
				1245	longfelem_scalar(tmp2, 2);
				1246	/* tmp2[i] < 22^67 = 2^68 /
				1247	longfelem_diff(tmp, tmp2);
				1248	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
				1249	felem_reduce_zero105(y_out, tmp);
				1250	/* y_out[i] < 2^106 */
				1251
				1252	copy_small_conditional(x_out, x2, z1_is_zero);
				1253	copy_conditional(x_out, x1, z2_is_zero);
				1254	copy_small_conditional(y_out, y2, z1_is_zero);
				1255	copy_conditional(y_out, y1, z2_is_zero);
				1256	copy_small_conditional(z_out, z2, z1_is_zero);
				1257	copy_conditional(z_out, z1, z2_is_zero);
				1258	felem_assign(x3, x_out);
				1259	felem_assign(y3, y_out);
				1260	felem_assign(z3, z_out);
				1261	}
				1262
				1263	/* point_add_small is the same as point_add, except that it operates on
				1264	* smallfelems. */
				1265	static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
				1266	smallfelem x1, smallfelem y1, smallfelem z1,
				1267	smallfelem x2, smallfelem y2, smallfelem z2) {
				1268	felem felem_x3, felem_y3, felem_z3;
				1269	felem felem_x1, felem_y1, felem_z1;
				1270	smallfelem_expand(felem_x1, x1);
				1271	smallfelem_expand(felem_y1, y1);
				1272	smallfelem_expand(felem_z1, z1);
				1273	point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0, x2,
				1274	y2, z2);
				1275	felem_shrink(x3, felem_x3);
				1276	felem_shrink(y3, felem_y3);
				1277	felem_shrink(z3, felem_z3);
				1278	}
				1279
/* Base point pre computation
 * --------------------------
 *
 * Two different sorts of precomputed tables are used in the following code.
 * Each contains various points on the curve, where each point is three field
 * elements (x, y, z).
 *
 * For the base point table, z is usually 1 (0 for the point at infinity).
 * This table has 2 * 16 elements, starting with the following:
 * index | bits    | point
 * ------+---------+------------------------------
 *     0 | 0 0 0 0 | 0G
 *     1 | 0 0 0 1 | 1G
 *     2 | 0 0 1 0 | 2^64G
 *     3 | 0 0 1 1 | (2^64 + 1)G
 *     4 | 0 1 0 0 | 2^128G
 *     5 | 0 1 0 1 | (2^128 + 1)G
 *     6 | 0 1 1 0 | (2^128 + 2^64)G
 *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
 *     8 | 1 0 0 0 | 2^192G
 *     9 | 1 0 0 1 | (2^192 + 1)G
 *    10 | 1 0 1 0 | (2^192 + 2^64)G
 *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
 *    12 | 1 1 0 0 | (2^192 + 2^128)G
 *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
 *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
 *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
 * followed by a copy of this with each element multiplied by 2^32.
 *
 * The reason for this is so that we can clock bits into four different
 * locations when doing simple scalar multiplies against the base point,
 * and then another four locations using the second 16 elements.
 *
 * Tables for other points have table[i] = i*P for i in 0 .. 16, where P is
 * the point the table belongs to. */
				1314
/* gmul is the table of precomputed base points.
 *
 * It is indexed as gmul[table][index][coordinate], where coordinate 0, 1, 2
 * is x, y, z. Entry 0 of each table is all zeros (z = 0, the point at
 * infinity); every other entry has z = {1, 0, 0, 0}. batch_mul selects from
 * gmul[0] using scalar bits i, i+64, i+128, i+192 and from gmul[1] using bits
 * i+32, i+96, i+160, i+224. */
static const smallfelem gmul[2][16][3] = {
    /* gmul[0]: combinations of G, 2^64*G, 2^128*G and 2^192*G. */
    {{{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}},
     {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
       0x6b17d1f2e12c4247},
      {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
       0x4fe342e2fe1a7f9b},
      {1, 0, 0, 0}},
     {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
       0x0fa822bc2811aaa5},
      {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
       0xbff44ae8f5dba80d},
      {1, 0, 0, 0}},
     {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
       0x300a4bbc89d6726f},
      {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
       0x72aac7e0d09b4644},
      {1, 0, 0, 0}},
     {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
       0x447d739beedb5e67},
      {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
       0x2d4825ab834131ee},
      {1, 0, 0, 0}},
     {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
       0xef9519328a9c72ff},
      {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
       0x611e9fc37dbb2c9b},
      {1, 0, 0, 0}},
     {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
       0x550663797b51f5d8},
      {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
       0x157164848aecb851},
      {1, 0, 0, 0}},
     {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
       0xeb5d7745b21141ea},
      {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
       0xeafd72ebdbecc17b},
      {1, 0, 0, 0}},
     {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
       0xa6d39677a7849276},
      {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
       0x674f84749b0b8816},
      {1, 0, 0, 0}},
     {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
       0x4e769e7672c9ddad},
      {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
       0x42b99082de830663},
      {1, 0, 0, 0}},
     {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
       0x78878ef61c6ce04d},
      {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
       0xb6cb3f5d7b72c321},
      {1, 0, 0, 0}},
     {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
       0x0c88bc4d716b1287},
      {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
       0xdd5ddea3f3901dc6},
      {1, 0, 0, 0}},
     {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
       0x68f344af6b317466},
      {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
       0x31b9c405f8540a20},
      {1, 0, 0, 0}},
     {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
       0x4052bf4b6f461db9},
      {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
       0xfecf4d5190b0fc61},
      {1, 0, 0, 0}},
     {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
       0x1eddbae2c802e41a},
      {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
       0x43104d86560ebcfc},
      {1, 0, 0, 0}},
     {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
       0xb48e26b484f7a21c},
      {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
       0xfac015404d4d3dab},
      {1, 0, 0, 0}}},
    /* gmul[1]: the same combinations, each multiplied by 2^32. */
    {{{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}},
     {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
       0x7fe36b40af22af89},
      {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
       0xe697d45825b63624},
      {1, 0, 0, 0}},
     {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
       0x4a5b506612a677a6},
      {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
       0xeb13461ceac089f1},
      {1, 0, 0, 0}},
     {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
       0x0781b8291c6a220a},
      {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
       0x690cde8df0151593},
      {1, 0, 0, 0}},
     {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
       0x8a535f566ec73617},
      {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
       0x0455c08468b08bd7},
      {1, 0, 0, 0}},
     {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
       0x06bada7ab77f8276},
      {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
       0x5b476dfd0e6cb18a},
      {1, 0, 0, 0}},
     {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
       0x3e29864e8a2ec908},
      {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
       0x239b90ea3dc31e7e},
      {1, 0, 0, 0}},
     {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
       0x820f4dd949f72ff7},
      {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
       0x140406ec783a05ec},
      {1, 0, 0, 0}},
     {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
       0x68f6b8542783dfee},
      {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
       0xcbe1feba92e40ce6},
      {1, 0, 0, 0}},
     {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
       0xd0b2f94d2f420109},
      {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
       0x971459828b0719e5},
      {1, 0, 0, 0}},
     {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
       0x961610004a866aba},
      {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
       0x7acb9fadcee75e44},
      {1, 0, 0, 0}},
     {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
       0x24eb9acca333bf5b},
      {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
       0x69f891c5acd079cc},
      {1, 0, 0, 0}},
     {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
       0xe51f547c5972a107},
      {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
       0x1c309a2b25bb1387},
      {1, 0, 0, 0}},
     {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
       0x20b87b8aa2c4e503},
      {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
       0xf5c6fa49919776be},
      {1, 0, 0, 0}},
     {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
       0x1ed7d1b9332010b9},
      {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
       0x3a2b03f03217257a},
      {1, 0, 0, 0}},
     {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
       0x15fee545c78dd9f6},
      {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
       0x4ab5b6b2b8753f81},
      {1, 0, 0, 0}}}};
				1469
				1470	/* select_point selects the \|idx\|th point from a precomputation table and
				1471	* copies it to out. */
				1472	static void select_point(const u64 idx, unsigned int size,
				1473	const smallfelem pre_comp[16][3], smallfelem out[3]) {
				1474	unsigned i, j;
				1475	u64 *outlimbs = &out[0][0];
				1476	memset(outlimbs, 0, 3 * sizeof(smallfelem));
				1477
				1478	for (i = 0; i < size; i++) {
				1479	const u64 inlimbs = (u64 )&pre_comp[i][0][0];
				1480	u64 mask = i ^ idx;
				1481	mask \|= mask >> 4;
				1482	mask \|= mask >> 2;
				1483	mask \|= mask >> 1;
				1484	mask &= 1;
				1485	mask--;
				1486	for (j = 0; j < NLIMBS * 3; j++) {
				1487	outlimbs[j] \|= inlimbs[j] & mask;
				1488	}
				1489	}
				1490	}
				1491
				1492	/* get_bit returns the \|i\|th bit in \|in\| */
				1493	static char get_bit(const felem_bytearray in, int i) {
				1494	if (i < 0 \|\| i >= 256) {
				1495	return 0;
				1496	}
				1497	return (in[i >> 3] >> (i & 7)) & 1;
				1498	}
				1499
				1500	/* Interleaved point multiplication using precomputed point multiples: The
				1501	* small point multiples 0P, 1P, ..., 17*P are in pre_comp[], the scalars
				1502	* in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
				1503	* generator, using certain (large) precomputed multiples in g_pre_comp.
				1504	* Output point (X, Y, Z) is stored in x_out, y_out, z_out. */
				1505	static void batch_mul(felem x_out, felem y_out, felem z_out,
				1506	const felem_bytearray scalars[],
				1507	const unsigned num_points, const u8 *g_scalar,
				1508	const int mixed, const smallfelem pre_comp[][17][3],
				1509	const smallfelem g_pre_comp[2][16][3]) {
				1510	int i, skip;
				1511	unsigned num, gen_mul = (g_scalar != NULL);
				1512	felem nq[3], ftmp;
				1513	smallfelem tmp[3];
				1514	u64 bits;
				1515	u8 sign, digit;
				1516
				1517	/* set nq to the point at infinity */
				1518	memset(nq, 0, 3 * sizeof(felem));
				1519
				1520	/* Loop over all scalars msb-to-lsb, interleaving additions of multiples
				1521	* of the generator (two in each of the last 32 rounds) and additions of
				1522	* other points multiples (every 5th round). */
				1523
				1524	skip = 1; /* save two point operations in the first
				1525	* round */
				1526	for (i = (num_points ? 255 : 31); i >= 0; --i) {
				1527	/* double */
				1528	if (!skip) {
				1529	point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
				1530	}
				1531
				1532	/* add multiples of the generator */
				1533	if (gen_mul && i <= 31) {
				1534	/* first, look 32 bits upwards */
				1535	bits = get_bit(g_scalar, i + 224) << 3;
				1536	bits \|= get_bit(g_scalar, i + 160) << 2;
				1537	bits \|= get_bit(g_scalar, i + 96) << 1;
				1538	bits \|= get_bit(g_scalar, i + 32);
				1539	/* select the point to add, in constant time */
				1540	select_point(bits, 16, g_pre_comp[1], tmp);
				1541
				1542	if (!skip) {
				1543	/* Arg 1 below is for "mixed" */
				1544	point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1, tmp[0], tmp[1],
				1545	tmp[2]);
				1546	} else {
				1547	smallfelem_expand(nq[0], tmp[0]);
				1548	smallfelem_expand(nq[1], tmp[1]);
				1549	smallfelem_expand(nq[2], tmp[2]);
				1550	skip = 0;
				1551	}
				1552
				1553	/* second, look at the current position */
				1554	bits = get_bit(g_scalar, i + 192) << 3;
				1555	bits \|= get_bit(g_scalar, i + 128) << 2;
				1556	bits \|= get_bit(g_scalar, i + 64) << 1;
				1557	bits \|= get_bit(g_scalar, i);
				1558	/* select the point to add, in constant time */
				1559	select_point(bits, 16, g_pre_comp[0], tmp);
				1560	/* Arg 1 below is for "mixed" */
				1561	point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1, tmp[0], tmp[1],
				1562	tmp[2]);
				1563	}
				1564
				1565	/* do other additions every 5 doublings */
				1566	if (num_points && (i % 5 == 0)) {
				1567	/* loop over all scalars */
				1568	for (num = 0; num < num_points; ++num) {
				1569	bits = get_bit(scalars[num], i + 4) << 5;
				1570	bits \|= get_bit(scalars[num], i + 3) << 4;
				1571	bits \|= get_bit(scalars[num], i + 2) << 3;
				1572	bits \|= get_bit(scalars[num], i + 1) << 2;
				1573	bits \|= get_bit(scalars[num], i) << 1;
				1574	bits \|= get_bit(scalars[num], i - 1);
				1575	ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
				1576
				1577	/* select the point to add or subtract, in constant time. */
				1578	select_point(digit, 17, pre_comp[num], tmp);
				1579	smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
				1580	* point */
				1581	copy_small_conditional(ftmp, tmp[1], (((limb)sign) - 1));
				1582	felem_contract(tmp[1], ftmp);
				1583
				1584	if (!skip) {
				1585	point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], mixed, tmp[0],
				1586	tmp[1], tmp[2]);
				1587	} else {
				1588	smallfelem_expand(nq[0], tmp[0]);
				1589	smallfelem_expand(nq[1], tmp[1]);
				1590	smallfelem_expand(nq[2], tmp[2]);
				1591	skip = 0;
				1592	}
				1593	}
				1594	}
				1595	}
				1596	felem_assign(x_out, nq[0]);
				1597	felem_assign(y_out, nq[1]);
				1598	felem_assign(z_out, nq[2]);
				1599	}
				1600
/* Precomputation for the group generator. */
typedef struct {
  /* Same dimensions as the static gmul table: [2][16] points of three
   * smallfelems each. */
  smallfelem g_pre_comp[2][16][3];
  /* NOTE(review): presumably a reference count; no user of this field is
   * visible in this part of the file, so confirm before relying on it. */
  int references;
} NISTP256_PRE_COMP;
				1606
				1607	/******************************************************************************/
				1608	/*
				1609	* OPENSSL EC_METHOD FUNCTIONS
				1610	*/
				1611
				1612	int ec_GFp_nistp256_group_init(EC_GROUP *group) {
				1613	int ret = ec_GFp_simple_group_init(group);
				1614	group->a_is_minus3 = 1;
				1615	return ret;
				1616	}
				1617
				1618	int ec_GFp_nistp256_group_set_curve(EC_GROUP group, const BIGNUM p,
				1619	const BIGNUM a, const BIGNUM b,
				1620	BN_CTX *ctx) {
				1621	int ret = 0;
				1622	BN_CTX *new_ctx = NULL;
				1623	BIGNUM curve_p, curve_a, *curve_b;
				1624
				1625	if (ctx == NULL) {
				1626	if ((ctx = new_ctx = BN_CTX_new()) == NULL) {
				1627	return 0;
				1628	}
				1629	}
				1630	BN_CTX_start(ctx);
				1631	if (((curve_p = BN_CTX_get(ctx)) == NULL) \|\|
				1632	((curve_a = BN_CTX_get(ctx)) == NULL) \|\|
				1633	((curve_b = BN_CTX_get(ctx)) == NULL)) {
				1634	goto err;
				1635	}
				1636	BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
				1637	BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
				1638	BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
				1639	if (BN_cmp(curve_p, p) \|\|
				1640	BN_cmp(curve_a, a) \|\|
				1641	BN_cmp(curve_b, b)) {
				1642	OPENSSL_PUT_ERROR(EC, ec_GFp_nistp256_group_set_curve,
				1643	EC_R_WRONG_CURVE_PARAMETERS);
				1644	goto err;
				1645	}
				1646	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
				1647
				1648	err:
				1649	BN_CTX_end(ctx);
				1650	if (new_ctx != NULL) {
				1651	BN_CTX_free(new_ctx);
				1652	}
				1653	return ret;
				1654	}
				1655
				1656	/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
				1657	* (X/Z^2, Y/Z^3). */
				1658	int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
				1659	const EC_POINT *point,
				1660	BIGNUM x, BIGNUM y,
				1661	BN_CTX *ctx) {
				1662	felem z1, z2, x_in, y_in;
				1663	smallfelem x_out, y_out;
				1664	longfelem tmp;
				1665
				1666	if (EC_POINT_is_at_infinity(group, point)) {
				1667	OPENSSL_PUT_ERROR(EC, ec_GFp_nistp256_point_get_affine_coordinates,
				1668	EC_R_POINT_AT_INFINITY);
				1669	return 0;
				1670	}
				1671	if (!BN_to_felem(x_in, &point->X) \|\|
				1672	!BN_to_felem(y_in, &point->Y) \|\|
				1673	!BN_to_felem(z1, &point->Z)) {
				1674	return 0;
				1675	}
				1676	felem_inv(z2, z1);
				1677	felem_square(tmp, z2);
				1678	felem_reduce(z1, tmp);
				1679	felem_mul(tmp, x_in, z1);
				1680	felem_reduce(x_in, tmp);
				1681	felem_contract(x_out, x_in);
				1682	if (x != NULL && !smallfelem_to_BN(x, x_out)) {
				1683	OPENSSL_PUT_ERROR(EC, ec_GFp_nistp256_point_get_affine_coordinates,
				1684	ERR_R_BN_LIB);
				1685	return 0;
				1686	}
				1687	felem_mul(tmp, z1, z2);
				1688	felem_reduce(z1, tmp);
				1689	felem_mul(tmp, y_in, z1);
				1690	felem_reduce(y_in, tmp);
				1691	felem_contract(y_out, y_in);
				1692	if (y != NULL && !smallfelem_to_BN(y, y_out)) {
				1693	OPENSSL_PUT_ERROR(EC, ec_GFp_nistp256_point_get_affine_coordinates,
				1694	ERR_R_BN_LIB);
				1695	return 0;
				1696	}
				1697	return 1;
				1698	}
				1699
				1700	/* points below is of size \|num\|, and tmp_smallfelems is of size \|num+1\| */
				1701	static void make_points_affine(size_t num, smallfelem points[][3],
				1702	smallfelem tmp_smallfelems[]) {
				1703	/* Runs in constant time, unless an input is the point at infinity (which
				1704	* normally shouldn't happen). */
				1705	ec_GFp_nistp_points_make_affine_internal(
				1706	num, points, sizeof(smallfelem), tmp_smallfelems,
				1707	(void ()(void ))smallfelem_one,
				1708	(int ()(const void ))smallfelem_is_zero_int,
				1709	(void ()(void , const void *))smallfelem_assign,
				1710	(void ()(void , const void *))smallfelem_square_contract,
				1711	(void ()(void , const void , const void ))smallfelem_mul_contract,
				1712	(void ()(void , const void *))smallfelem_inv_contract,
				1713	/* nothing to contract */
				1714	(void ()(void , const void *))smallfelem_assign);
				1715	}
				1716
/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
 * values. Result is stored in r (r can equal one of the inputs). */
				1719	int ec_GFp_nistp256_points_mul(const EC_GROUP group, EC_POINT r,
				1720	const BIGNUM *scalar, size_t num,
				1721	const EC_POINT *points[],
				1722	const BIGNUM scalars[], BN_CTX ctx) {
				1723	int ret = 0;
				1724	int j;
				1725	int mixed = 0;
				1726	BN_CTX *new_ctx = NULL;
				1727	BIGNUM x, y, z, tmp_scalar;
				1728	felem_bytearray g_secret;
				1729	felem_bytearray *secrets = NULL;
				1730	smallfelem(*pre_comp)[17][3] = NULL;
				1731	smallfelem *tmp_smallfelems = NULL;
				1732	felem_bytearray tmp;
				1733	unsigned i, num_bytes;
				1734	int have_pre_comp = 0;
				1735	size_t num_points = num;
				1736	smallfelem x_in, y_in, z_in;
				1737	felem x_out, y_out, z_out;
				1738	const smallfelem(*g_pre_comp)[16][3] = NULL;
				1739	EC_POINT *generator = NULL;
				1740	const EC_POINT *p = NULL;
				1741	const BIGNUM *p_scalar = NULL;
				1742
				1743	if (ctx == NULL) {
				1744	ctx = new_ctx = BN_CTX_new();
				1745	if (ctx == NULL) {
				1746	return 0;
				1747	}
				1748	}
				1749
				1750	BN_CTX_start(ctx);
				1751	if ((x = BN_CTX_get(ctx)) == NULL \|\|
				1752	(y = BN_CTX_get(ctx)) == NULL \|\|
				1753	(z = BN_CTX_get(ctx)) == NULL \|\|
				1754	(tmp_scalar = BN_CTX_get(ctx)) == NULL) {
				1755	goto err;
				1756	}
				1757
				1758	if (scalar != NULL) {
				1759	/* try to use the standard precomputation */
				1760	g_pre_comp = &gmul[0];
				1761	generator = EC_POINT_new(group);
				1762	if (generator == NULL) {
				1763	goto err;
				1764	}
				1765	/* get the generator from precomputation */
				1766	if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) \|\|
				1767	!smallfelem_to_BN(y, g_pre_comp[0][1][1]) \|\|
				1768	!smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
				1769	OPENSSL_PUT_ERROR(EC, ec_GFp_nistp256_points_mul, ERR_R_BN_LIB);
				1770	goto err;
				1771	}
				1772	if (!ec_point_set_Jprojective_coordinates_GFp(group, generator, x, y, z,
				1773	ctx)) {
				1774	goto err;
				1775	}
				1776	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
				1777	/* precomputation matches generator */
				1778	have_pre_comp = 1;
				1779	} else {
				1780	/* we don't have valid precomputation: treat the generator as a
				1781	* random point. */
				1782	num_points++;
				1783	}
				1784	}
				1785
				1786	if (num_points > 0) {
				1787	if (num_points >= 3) {
				1788	/* unless we precompute multiples for just one or two points,
				1789	* converting those into affine form is time well spent */
				1790	mixed = 1;
				1791	}
				1792	secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
				1793	pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
				1794	if (mixed) {
				1795	tmp_smallfelems =
				1796	OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
				1797	}
				1798	if (secrets == NULL \|\| pre_comp == NULL \|\|
				1799	(mixed && tmp_smallfelems == NULL)) {
				1800	OPENSSL_PUT_ERROR(EC, ec_GFp_nistp256_points_mul, ERR_R_MALLOC_FAILURE);
				1801	goto err;
				1802	}
				1803
				1804	/* we treat NULL scalars as 0, and NULL points as points at infinity,
				1805	* i.e., they contribute nothing to the linear combination. */
				1806	memset(secrets, 0, num_points * sizeof(felem_bytearray));
				1807	memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
				1808	for (i = 0; i < num_points; ++i) {
				1809	if (i == num) {
				1810	/* we didn't have a valid precomputation, so we pick the generator. */
				1811	p = EC_GROUP_get0_generator(group);
				1812	p_scalar = scalar;
				1813	} else {
				1814	/* the i^th point */
				1815	p = points[i];
				1816	p_scalar = scalars[i];
				1817	}
				1818	if (p_scalar != NULL && p != NULL) {
				1819	/* reduce scalar to 0 <= scalar < 2^256 */
				1820	if (BN_num_bits(p_scalar) > 256 \|\| BN_is_negative(p_scalar)) {
				1821	/* this is an unusual input, and we don't guarantee
				1822	* constant-timeness. */
				1823	if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
				1824	OPENSSL_PUT_ERROR(EC, ec_GFp_nistp256_points_mul, ERR_R_BN_LIB);
				1825	goto err;
				1826	}
				1827	num_bytes = BN_bn2bin(tmp_scalar, tmp);
				1828	} else {
				1829	num_bytes = BN_bn2bin(p_scalar, tmp);
				1830	}
				1831	flip_endian(secrets[i], tmp, num_bytes);
				1832	/* precompute multiples */
				1833	if (!BN_to_felem(x_out, &p->X) \|\|
				1834	!BN_to_felem(y_out, &p->Y) \|\|
				1835	!BN_to_felem(z_out, &p->Z)) {
				1836	goto err;
				1837	}
				1838	felem_shrink(pre_comp[i][1][0], x_out);
				1839	felem_shrink(pre_comp[i][1][1], y_out);
				1840	felem_shrink(pre_comp[i][1][2], z_out);
				1841	for (j = 2; j <= 16; ++j) {
				1842	if (j & 1) {
				1843	point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
				1844	pre_comp[i][j][2], pre_comp[i][1][0],
				1845	pre_comp[i][1][1], pre_comp[i][1][2],
				1846	pre_comp[i][j - 1][0], pre_comp[i][j - 1][1],
				1847	pre_comp[i][j - 1][2]);
				1848	} else {
				1849	point_double_small(pre_comp[i][j][0], pre_comp[i][j][1],
				1850	pre_comp[i][j][2], pre_comp[i][j / 2][0],
				1851	pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
				1852	}
				1853	}
				1854	}
				1855	}
				1856	if (mixed) {
				1857	make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
				1858	}
				1859	}
				1860
				1861	/* the scalar for the generator */
				1862	if (scalar != NULL && have_pre_comp) {
				1863	memset(g_secret, 0, sizeof(g_secret));
				1864	/* reduce scalar to 0 <= scalar < 2^256 */
				1865	if (BN_num_bits(scalar) > 256 \|\| BN_is_negative(scalar)) {
				1866	/* this is an unusual input, and we don't guarantee
				1867	* constant-timeness. */
				1868	if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
				1869	OPENSSL_PUT_ERROR(EC, ec_GFp_nistp256_points_mul, ERR_R_BN_LIB);
				1870	goto err;
				1871	}
				1872	num_bytes = BN_bn2bin(tmp_scalar, tmp);
				1873	} else {
				1874	num_bytes = BN_bn2bin(scalar, tmp);
				1875	}
				1876	flip_endian(g_secret, tmp, num_bytes);
				1877	/* do the multiplication with generator precomputation */
				1878	batch_mul(x_out, y_out, z_out, (const felem_bytearray(*))secrets,
				1879	num_points, g_secret, mixed, (const smallfelem(*)[17][3])pre_comp,
				1880	g_pre_comp);
				1881	} else {
				1882	/* do the multiplication without generator precomputation */
				1883	batch_mul(x_out, y_out, z_out, (const felem_bytearray(*))secrets,
				1884	num_points, NULL, mixed, (const smallfelem(*)[17][3])pre_comp,
				1885	NULL);
				1886	}
				1887
				1888	/* reduce the output to its unique minimal representation */
				1889	felem_contract(x_in, x_out);
				1890	felem_contract(y_in, y_out);
				1891	felem_contract(z_in, z_out);
				1892	if (!smallfelem_to_BN(x, x_in) \|\|
				1893	!smallfelem_to_BN(y, y_in) \|\|
				1894	!smallfelem_to_BN(z, z_in)) {
				1895	OPENSSL_PUT_ERROR(EC, ec_GFp_nistp256_points_mul, ERR_R_BN_LIB);
				1896	goto err;
				1897	}
				1898	ret = ec_point_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
				1899
				1900	err:
				1901	BN_CTX_end(ctx);
				1902	if (generator != NULL) {
				1903	EC_POINT_free(generator);
				1904	}
				1905	if (new_ctx != NULL) {
				1906	BN_CTX_free(new_ctx);
				1907	}
				1908	if (secrets != NULL) {
				1909	OPENSSL_free(secrets);
				1910	}
				1911	if (pre_comp != NULL) {
				1912	OPENSSL_free(pre_comp);
				1913	}
				1914	if (tmp_smallfelems != NULL) {
				1915	OPENSSL_free(tmp_smallfelems);
				1916	}
				1917	return ret;
				1918	}
				1919
				1920	const EC_METHOD *EC_GFp_nistp256_method(void) {
				1921	static const EC_METHOD ret = {
				1922	EC_FLAGS_DEFAULT_OCT,
				1923	ec_GFp_nistp256_group_init,
				1924	ec_GFp_simple_group_finish,
				1925	ec_GFp_simple_group_clear_finish,
				1926	ec_GFp_simple_group_copy, ec_GFp_nistp256_group_set_curve,
				1927	ec_GFp_simple_group_get_curve, ec_GFp_simple_group_get_degree,
				1928	ec_GFp_simple_group_check_discriminant, ec_GFp_simple_point_init,
				1929	ec_GFp_simple_point_finish, ec_GFp_simple_point_clear_finish,
				1930	ec_GFp_simple_point_copy, ec_GFp_simple_point_set_to_infinity,
				1931	ec_GFp_simple_set_Jprojective_coordinates_GFp,
				1932	ec_GFp_simple_get_Jprojective_coordinates_GFp,
				1933	ec_GFp_simple_point_set_affine_coordinates,
				1934	ec_GFp_nistp256_point_get_affine_coordinates,
				1935	0 /* point_set_compressed_coordinates /, 0 / point2oct */,
				1936	0 /* oct2point */, ec_GFp_simple_add, ec_GFp_simple_dbl,
				1937	ec_GFp_simple_invert, ec_GFp_simple_is_at_infinity,
				1938	ec_GFp_simple_is_on_curve, ec_GFp_simple_cmp, ec_GFp_simple_make_affine,
				1939	ec_GFp_simple_points_make_affine, ec_GFp_nistp256_points_mul,
				1940	0 /* precompute_mult /, 0 / have_precompute_mult */,
				1941	ec_GFp_simple_field_mul, ec_GFp_simple_field_sqr, 0 /* field_div */,
				1942	0 /* field_encode /, 0 / field_decode /, 0 / field_set_to_one */
				1943	};
				1944
				1945	return &ret;
				1946	}
				1947
				1948	#endif /* 64_BIT && !WINDOWS */