Implements SIKE/p434
* CECPQ2b will use SIKE/p434 instead of SIKE/p503
* KEM uses SHA256 instead of HMAC-SHA256
* implements new starting curve: y^2=x^3 + 6x^2 + x
* adds optimized implementation for aarch64
* adds optimized implementation for AMD64 CPUs
which do not support MULX/ADOX/ADCX
* syncs the SIKE test code with the NIST Round 2
specification.
* removes references to field size from variable
names, tests and defines.
Change-Id: I5359c6c62ad342354c6d337f7ee525158586ec93
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/36704
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index 955dd8b..e97a4e1 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -411,7 +411,7 @@
../third_party/fiat/curve25519.c
../third_party/sike/fpx.c
../third_party/sike/isogeny.c
- ../third_party/sike/P503.c
+ ../third_party/sike/params.c
../third_party/sike/sike.c
../third_party/sike/asm/fp_generic.c
diff --git a/ssl/ssl_key_share.cc b/ssl/ssl_key_share.cc
index 2bf177b..826fb1a 100644
--- a/ssl/ssl_key_share.cc
+++ b/ssl/ssl_key_share.cc
@@ -319,18 +319,18 @@
uint8_t *out_alert, Span<const uint8_t> peer_key) override {
uint8_t public_x25519[32];
uint8_t private_x25519[32];
- uint8_t sike_ciphertext[SIKEp503_CT_BYTESZ] = {0};
+ uint8_t sike_ciphertext[SIKE_CT_BYTESZ] = {0};
*out_alert = SSL_AD_INTERNAL_ERROR;
- if (peer_key.size() != sizeof(public_x25519) + SIKEp503_PUB_BYTESZ) {
+ if (peer_key.size() != sizeof(public_x25519) + SIKE_PUB_BYTESZ) {
*out_alert = SSL_AD_DECODE_ERROR;
OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT);
return false;
}
Array<uint8_t> secret;
- if (!secret.Init(sizeof(private_x25519_) + SIKEp503_SS_BYTESZ)) {
+ if (!secret.Init(sizeof(private_x25519_) + SIKE_SS_BYTESZ)) {
OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
return false;
}
@@ -357,12 +357,12 @@
*out_alert = SSL_AD_INTERNAL_ERROR;
Array<uint8_t> secret;
- if (!secret.Init(sizeof(private_x25519_) + SIKEp503_SS_BYTESZ)) {
+ if (!secret.Init(sizeof(private_x25519_) + SIKE_SS_BYTESZ)) {
OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
return false;
}
- if (peer_key.size() != 32 + SIKEp503_CT_BYTESZ ||
+ if (peer_key.size() != 32 + SIKE_CT_BYTESZ ||
!X25519(secret.data(), private_x25519_, peer_key.data())) {
*out_alert = SSL_AD_DECODE_ERROR;
OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT);
@@ -377,8 +377,8 @@
private:
uint8_t private_x25519_[32];
- uint8_t private_sike_[SIKEp503_PRV_BYTESZ];
- uint8_t public_sike_[SIKEp503_PUB_BYTESZ];
+ uint8_t private_sike_[SIKE_PRV_BYTESZ];
+ uint8_t public_sike_[SIKE_PUB_BYTESZ];
};
CONSTEXPR_ARRAY NamedGroup kNamedGroups[] = {
diff --git a/ssl/test/runner/key_agreement.go b/ssl/test/runner/key_agreement.go
index 0d6405f..f4789b6 100644
--- a/ssl/test/runner/key_agreement.go
+++ b/ssl/test/runner/key_agreement.go
@@ -434,7 +434,7 @@
return preMasterSecret, nil
}
-// cecpq2BCurve implements CECPQ2b, which is SIKEp503 combined with X25519.
+// cecpq2BCurve implements CECPQ2b, which is SIKE combined with X25519.
type cecpq2BCurve struct {
// Both public key and shared secret size
x25519PrivateKey [32]byte
diff --git a/ssl/test/runner/sike/arith.go b/ssl/test/runner/sike/arith.go
index 338a714..10a2ca6 100644
--- a/ssl/test/runner/sike/arith.go
+++ b/ssl/test/runner/sike/arith.go
@@ -22,22 +22,22 @@
func fpAddRdc(z, x, y *Fp) {
var carry uint64
- // z=x+y % p503
+ // z=x+y % p
for i := 0; i < FP_WORDS; i++ {
z[i], carry = bits.Add64(x[i], y[i], carry)
}
- // z = z - p503x2
+ // z = z - pX2
carry = 0
for i := 0; i < FP_WORDS; i++ {
- z[i], carry = bits.Sub64(z[i], p503x2[i], carry)
+ z[i], carry = bits.Sub64(z[i], pX2[i], carry)
}
- // if z<0 add p503x2 back
+ // if z<0 add pX2 back
mask := uint64(0 - carry)
carry = 0
for i := 0; i < FP_WORDS; i++ {
- z[i], carry = bits.Add64(z[i], p503x2[i]&mask, carry)
+ z[i], carry = bits.Add64(z[i], pX2[i]&mask, carry)
}
}
@@ -45,16 +45,16 @@
func fpSubRdc(z, x, y *Fp) {
var borrow uint64
- // z = z - p503x2
+ // z = z - pX2
for i := 0; i < FP_WORDS; i++ {
z[i], borrow = bits.Sub64(x[i], y[i], borrow)
}
- // if z<0 add p503x2 back
+ // if z<0 add pX2 back
mask := uint64(0 - borrow)
borrow = 0
for i := 0; i < FP_WORDS; i++ {
- z[i], borrow = bits.Add64(z[i], p503x2[i]&mask, borrow)
+ z[i], borrow = bits.Add64(z[i], pX2[i]&mask, borrow)
}
}
@@ -62,14 +62,14 @@
func fpRdcP(x *Fp) {
var borrow, mask uint64
for i := 0; i < FP_WORDS; i++ {
- x[i], borrow = bits.Sub64(x[i], p503[i], borrow)
+ x[i], borrow = bits.Sub64(x[i], p[i], borrow)
}
// Sets all bits if borrow = 1
mask = 0 - borrow
borrow = 0
for i := 0; i < FP_WORDS; i++ {
- x[i], borrow = bits.Add64(x[i], p503[i]&mask, borrow)
+ x[i], borrow = bits.Add64(x[i], p[i]&mask, borrow)
}
}
@@ -123,12 +123,12 @@
var hi, lo uint64
var count int
- count = 3 // number of 0 digits in the least significat part of p503 + 1
+ count = 3 // number of 0 digits in the least significant part of p + 1
for i := 0; i < FP_WORDS; i++ {
for j := 0; j < i; j++ {
if j < (i - count + 1) {
- hi, lo = bits.Mul64(z[j], p503p1[i-j])
+ hi, lo = bits.Mul64(z[j], p1[i-j])
v, carry = bits.Add64(lo, v, 0)
u, carry = bits.Add64(hi, u, carry)
t += carry
@@ -150,7 +150,7 @@
}
for j := i - FP_WORDS + 1; j < FP_WORDS; j++ {
if j < (FP_WORDS - count) {
- hi, lo = bits.Mul64(z[j], p503p1[i-j])
+ hi, lo = bits.Mul64(z[j], p1[i-j])
v, carry = bits.Add64(lo, v, 0)
u, carry = bits.Add64(hi, u, carry)
t += carry
@@ -188,7 +188,7 @@
mask = 0 - borrow
borrow = 0
for i := FP_WORDS; i < 2*FP_WORDS; i++ {
- z[i], borrow = bits.Add64(z[i], p503[i-FP_WORDS]&mask, borrow)
+ z[i], borrow = bits.Add64(z[i], p[i-FP_WORDS]&mask, borrow)
}
}
@@ -210,25 +210,34 @@
//
// Allowed to overlap x with dest.
// All values in Montgomery domains
+// Set dest = x^(2^k), for k >= 1, by repeated squarings.
func p34(dest, x *Fp) {
+ var lookup [16]Fp
- // Set dest = x^(2^k), for k >= 1, by repeated squarings.
- pow2k := func(dest, x *Fp, k uint8) {
- fpMulRdc(dest, x, x)
- for i := uint8(1); i < k; i++ {
- fpMulRdc(dest, dest, dest)
- }
- }
- // Sliding-window strategy computed with etc/scripts/sliding_window_strat_calc.py
- //
// This performs sum(powStrategy) + 1 squarings and len(lookup) + len(mulStrategy)
// multiplications.
- powStrategy := []uint8{1, 12, 5, 5, 2, 7, 11, 3, 8, 4, 11, 4, 7, 5, 6, 3, 7, 5, 7, 2, 12, 5, 6, 4, 6, 8, 6, 4, 7, 5, 5, 8, 5, 8, 5, 5, 8, 9, 3, 6, 2, 10, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3}
- mulStrategy := []uint8{0, 12, 11, 10, 0, 1, 8, 3, 7, 1, 8, 3, 6, 7, 14, 2, 14, 14, 9, 0, 13, 9, 15, 5, 12, 7, 13, 7, 15, 6, 7, 9, 0, 5, 7, 6, 8, 8, 3, 7, 0, 10, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 3}
+ powStrategy := []uint8{
+ 0x03, 0x0A, 0x07, 0x05, 0x06, 0x05, 0x03, 0x08, 0x04, 0x07,
+ 0x05, 0x06, 0x04, 0x05, 0x09, 0x06, 0x03, 0x0B, 0x05, 0x05,
+ 0x02, 0x08, 0x04, 0x07, 0x07, 0x08, 0x05, 0x06, 0x04, 0x08,
+ 0x05, 0x02, 0x0A, 0x06, 0x05, 0x04, 0x08, 0x05, 0x05, 0x05,
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x01}
+ mulStrategy := []uint8{
+ 0x02, 0x0F, 0x09, 0x08, 0x0E, 0x0C, 0x02, 0x08, 0x05, 0x0F,
+ 0x08, 0x0F, 0x06, 0x06, 0x03, 0x02, 0x00, 0x0A, 0x09, 0x0D,
+ 0x01, 0x0C, 0x03, 0x07, 0x01, 0x0A, 0x08, 0x0B, 0x02, 0x0F,
+ 0x0E, 0x01, 0x0B, 0x0C, 0x0E, 0x03, 0x0B, 0x0F, 0x0F, 0x0F,
+ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
+ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
+ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
+ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x00}
+ initialMul := uint8(8)
// Precompute lookup table of odd multiples of x for window
// size k=5.
- lookup := [16]Fp{}
var xx Fp
fpMulRdc(&xx, x, x)
lookup[0] = *x
@@ -239,9 +248,12 @@
// Now lookup = {x, x^3, x^5, ... }
// so that lookup[i] = x^{2*i + 1}
// so that lookup[k/2] = x^k, for odd k
- *dest = lookup[mulStrategy[0]]
- for i := uint8(1); i < uint8(len(powStrategy)); i++ {
- pow2k(dest, dest, powStrategy[i])
+ *dest = lookup[initialMul]
+ for i := uint8(0); i < uint8(len(powStrategy)); i++ {
+ fpMulRdc(dest, dest, dest)
+ for j := uint8(1); j < powStrategy[i]; j++ {
+ fpMulRdc(dest, dest, dest)
+ }
fpMulRdc(dest, dest, &lookup[mulStrategy[i]])
}
}
diff --git a/ssl/test/runner/sike/consts.go b/ssl/test/runner/sike/consts.go
index 0ecff52..9d68a4f 100644
--- a/ssl/test/runner/sike/consts.go
+++ b/ssl/test/runner/sike/consts.go
@@ -52,17 +52,21 @@
PublicKeySize int
// The shared secret size, in bytes.
SharedSecretSize int
+ // Defines A,C constant for starting curve Cy^2 = x^3 + Ax^2 + x
+ InitCurve ProjectiveCurveParameters
// 2- and 3-torsion group parameter definitions
A, B DomainParams
- // Precomputed identity element in the Fp2 in Montgomery domain
- OneFp2 Fp2
// Precomputed 1/2 in the Fp2 in Montgomery domain
HalfFp2 Fp2
+ // Precomputed identity element in the Fp2 in Montgomery domain
+ OneFp2 Fp2
// Length of SIKE secret message. Must be one of {24,32,40},
// depending on size of prime field used (see [SIKE], 1.4 and 5.1)
MsgLen int
// Length of SIKE ephemeral KEM key (see [SIKE], 1.4 and 5.1)
KemSize int
+ // Size of a ciphertext returned by encapsulation in bytes
+ CiphertextSize int
}
// Stores curve projective parameters equivalent to A/C. Meaning of the
@@ -130,172 +134,184 @@
// 110 - SIKE
KeyVariant_SIKE = 1<<2 | KeyVariant_SIDH_B
// Number of uint64 limbs used to store field element
- FP_WORDS = 8
+ FP_WORDS = 7
)
// Used internally by this package
// -------------------------------
-var p503 = Fp{
- 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xABFFFFFFFFFFFFFF,
- 0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E,
-}
+var (
+ p = Fp{
+ 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFDC1767AE2FFFFFF,
+ 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x2341F27177344,
+ }
-// 2*503
-var p503x2 = Fp{
- 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x57FFFFFFFFFFFFFF,
- 0x2610B7B44423CF41, 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0, 0x0080CDEA83023C3C,
-}
+ // 2*p434
+ pX2 = Fp{
+ 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFB82ECF5C5FFFFFF,
+ 0xF78CB8F062B15D47, 0xD9F8BFAD038A40AC, 0x4683E4E2EE688,
+ }
-// p503 + 1
-var p503p1 = Fp{
- 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xAC00000000000000,
- 0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E,
-}
+ // p434 + 1
+ p1 = Fp{
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFDC1767AE3000000,
+ 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344,
+ }
-// R^2=(2^512)^2 mod p
-var p503R2 = Fp{
- 0x5289A0CF641D011F, 0x9B88257189FED2B9, 0xA3B365D58DC8F17A, 0x5BC57AB6EFF168EC,
- 0x9E51998BD84D4423, 0xBF8999CBAC3B5695, 0x46E9127BCE14CDB6, 0x003F6CFCE8B81771,
-}
+ // R^2=(2^448)^2 mod p
+ R2 = Fp{
+ 0x28E55B65DCD69B30, 0xACEC7367768798C2, 0xAB27973F8311688D, 0x175CC6AF8D6C7C0B,
+ 0xABCD92BF2DDE347E, 0x69E16A61C7686D9A, 0x000025A89BCDD12A,
+ }
-// p503 + 1 left-shifted by 8, assuming little endianness
-var p503p1s8 = Fp{
- 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
- 0x085BDA2211E7A0AC, 0x9BF6C87B7E7DAF13, 0x45C6BDDA77A4D01B, 0x4066F541811E1E60,
-}
+ // 1/2 * R mod p
+ half = Fp2{
+ A: Fp{
+ 0x0000000000003A16, 0x0000000000000000, 0x0000000000000000, 0x5C87FA027E000000,
+ 0x6C00D27DAACFD66A, 0x74992A2A2FBBA086, 0x0000767753DE976D},
+ }
-// 1*R mod p
-var P503_OneFp2 = Fp2{
- A: Fp{
- 0x00000000000003F9, 0x0000000000000000, 0x0000000000000000, 0xB400000000000000,
- 0x63CB1A6EA6DED2B4, 0x51689D8D667EB37D, 0x8ACD77C71AB24142, 0x0026FBAEC60F5953},
-}
+ // 1*R mod p
+ one = Fp2{
+ A: Fp{
+ 0x000000000000742C, 0x0000000000000000, 0x0000000000000000, 0xB90FF404FC000000,
+ 0xD801A4FB559FACD4, 0xE93254545F77410C, 0x0000ECEEA7BD2EDA},
+ }
-// 1/2 * R mod p
-var P503_HalfFp2 = Fp2{
- A: Fp{
- 0x00000000000001FC, 0x0000000000000000, 0x0000000000000000, 0xB000000000000000,
- 0x3B69BB2464785D2A, 0x36824A2AF0FE9896, 0xF5899F427A94F309, 0x0033B15203C83BB8},
-}
+ // 6*R mod p
+ six = Fp2{
+ A: Fp{
+ 0x000000000002B90A, 0x0000000000000000, 0x0000000000000000, 0x5ADCCB2822000000,
+ 0x187D24F39F0CAFB4, 0x9D353A4D394145A0, 0x00012559A0403298},
+ }
-var Params SidhParams
+ Params SidhParams
+)
func init() {
Params = SidhParams{
// SIDH public key byte size.
- PublicKeySize: 378,
+ PublicKeySize: 330,
// SIDH shared secret byte size.
- SharedSecretSize: 126,
+ SharedSecretSize: 110,
+ InitCurve: ProjectiveCurveParameters{
+ A: six,
+ C: one,
+ },
A: DomainParams{
// The x-coordinate of PA
Affine_P: Fp2{
A: Fp{
- 0xE7EF4AA786D855AF, 0xED5758F03EB34D3B, 0x09AE172535A86AA9, 0x237B9CC07D622723,
- 0xE3A284CBA4E7932D, 0x27481D9176C5E63F, 0x6A323FF55C6E71BF, 0x002ECC31A6FB8773,
+ 0x05ADF455C5C345BF, 0x91935C5CC767AC2B, 0xAFE4E879951F0257, 0x70E792DC89FA27B1,
+ 0xF797F526BB48C8CD, 0x2181DB6131AF621F, 0x00000A1C08B1ECC4,
},
B: Fp{
- 0x64D02E4E90A620B8, 0xDAB8128537D4B9F1, 0x4BADF77B8A228F98, 0x0F5DBDF9D1FB7D1B,
- 0xBEC4DB288E1A0DCC, 0xE76A8665E80675DB, 0x6D6F252E12929463, 0x003188BD1463FACC,
+ 0x74840EB87CDA7788, 0x2971AA0ECF9F9D0B, 0xCB5732BDF41715D5, 0x8CD8E51F7AACFFAA,
+ 0xA7F424730D7E419F, 0xD671EB919A179E8C, 0x0000FFA26C5A924A,
},
},
// The x-coordinate of QA
Affine_Q: Fp2{
A: Fp{
- 0xB79D41025DE85D56, 0x0B867DA9DF169686, 0x740E5368021C827D, 0x20615D72157BF25C,
- 0xFF1590013C9B9F5B, 0xC884DCADE8C16CEA, 0xEBD05E53BF724E01, 0x0032FEF8FDA5748C,
+ 0xFEC6E64588B7273B, 0xD2A626D74CBBF1C6, 0xF8F58F07A78098C7, 0xE23941F470841B03,
+ 0x1B63EDA2045538DD, 0x735CFEB0FFD49215, 0x0001C4CB77542876,
},
B: Fp{
- 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
- 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0xADB0F733C17FFDD6, 0x6AFFBD037DA0A050, 0x680EC43DB144E02F, 0x1E2E5D5FF524E374,
+ 0xE2DDA115260E2995, 0xA6E4B552E2EDE508, 0x00018ECCDDF4B53E,
},
},
// The x-coordinate of RA = PA-QA
Affine_R: Fp2{
A: Fp{
- 0x12E2E849AA0A8006, 0x41CF47008635A1E8, 0x9CD720A70798AED7, 0x42A820B42FCF04CF,
- 0x7BF9BAD32AAE88B1, 0xF619127A54090BBE, 0x1CB10D8F56408EAA, 0x001D6B54C3C0EDEB,
+ 0x01BA4DB518CD6C7D, 0x2CB0251FE3CC0611, 0x259B0C6949A9121B, 0x60E17AC16D2F82AD,
+ 0x3AA41F1CE175D92D, 0x413FBE6A9B9BC4F3, 0x00022A81D8D55643,
},
B: Fp{
- 0x34DB54931CBAAC36, 0x420A18CB8DD5F0C4, 0x32008C1A48C0F44D, 0x3B3BA772B1CFD44D,
- 0xA74B058FDAF13515, 0x095FC9CA7EEC17B4, 0x448E829D28F120F8, 0x00261EC3ED16A489,
+ 0xB8ADBC70FC82E54A, 0xEF9CDDB0D5FADDED, 0x5820C734C80096A0, 0x7799994BAA96E0E4,
+ 0x044961599E379AF8, 0xDB2B94FBF09F27E2, 0x0000B87FC716C0C6,
},
},
// Max size of secret key for 2-torsion group, corresponds to 2^e2 - 1
- SecretBitLen: 250,
+ SecretBitLen: 216,
// SecretBitLen in bytes.
- SecretByteLen: uint((250 + 7) / 8),
+ SecretByteLen: 27,
// 2-torsion group computation strategy
IsogenyStrategy: []uint32{
- 0x3D, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01,
- 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02,
- 0x01, 0x01, 0x02, 0x01, 0x01, 0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
- 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
- 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x1D, 0x10, 0x08, 0x04, 0x02, 0x01,
- 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02,
- 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x0D, 0x08,
- 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
- 0x05, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01},
+ 0x30, 0x1C, 0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+ 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04,
+ 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01,
+ 0x02, 0x01, 0x01, 0x0D, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02,
+ 0x01, 0x01, 0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x04,
+ 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01,
+ 0x15, 0x0C, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
+ 0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x03, 0x02, 0x01,
+ 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x09, 0x05, 0x03,
+ 0x02, 0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x04,
+ 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01},
},
B: DomainParams{
// The x-coordinate of PB
Affine_P: Fp2{
A: Fp{
- 0x7EDE37F4FA0BC727, 0xF7F8EC5C8598941C, 0xD15519B516B5F5C8, 0xF6D5AC9B87A36282,
- 0x7B19F105B30E952E, 0x13BD8B2025B4EBEE, 0x7B96D27F4EC579A2, 0x00140850CAB7E5DE,
+ 0x6E5497556EDD48A3, 0x2A61B501546F1C05, 0xEB919446D049887D, 0x5864A4A69D450C4F,
+ 0xB883F276A6490D2B, 0x22CC287022D5F5B9, 0x0001BED4772E551F,
},
B: Fp{
- 0x7764909DAE7B7B2D, 0x578ABB16284911AB, 0x76E2BFD146A6BF4D, 0x4824044B23AA02F0,
- 0x1105048912A321F3, 0xB8A2E482CF0F10C1, 0x42FF7D0BE2152085, 0x0018E599C5223352,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
},
},
// The x-coordinate of QB
Affine_Q: Fp2{
A: Fp{
- 0x4256C520FB388820, 0x744FD7C3BAAF0A13, 0x4B6A2DDDB12CBCB8, 0xE46826E27F427DF8,
- 0xFE4A663CD505A61B, 0xD6B3A1BAF025C695, 0x7C3BB62B8FCC00BD, 0x003AFDDE4A35746C,
+ 0xFAE2A3F93D8B6B8E, 0x494871F51700FE1C, 0xEF1A94228413C27C, 0x498FF4A4AF60BD62,
+ 0xB00AD2A708267E8A, 0xF4328294E017837F, 0x000034080181D8AE,
},
B: Fp{
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
- 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
},
},
// The x-coordinate of RB = PB - QB
Affine_R: Fp2{
A: Fp{
- 0x75601CD1E6C0DFCB, 0x1A9007239B58F93E, 0xC1F1BE80C62107AC, 0x7F513B898F29FF08,
- 0xEA0BEDFF43E1F7B2, 0x2C6D94018CBAE6D0, 0x3A430D31BCD84672, 0x000D26892ECCFE83,
+ 0x283B34FAFEFDC8E4, 0x9208F44977C3E647, 0x7DEAE962816F4E9A, 0x68A2BA8AA262EC9D,
+ 0x8176F112EA43F45B, 0x02106D022634F504, 0x00007E8A50F02E37,
},
B: Fp{
- 0x1119D62AEA3007A1, 0xE3702AA4E04BAE1B, 0x9AB96F7D59F990E7, 0xF58440E8B43319C0,
- 0xAF8134BEE1489775, 0xE7F7774E905192AA, 0xF54AE09308E98039, 0x001EF7A041A86112,
+ 0xB378B7C1DA22CCB1, 0x6D089C99AD1D9230, 0xEBE15711813E2369, 0x2B35A68239D48A53,
+ 0x445F6FD138407C93, 0xBEF93B29A3F6B54B, 0x000173FA910377D3,
},
},
// Size of secret key for 3-torsion group, corresponds to log_2(3^e3) - 1.
- SecretBitLen: 252,
+ SecretBitLen: 217,
// SecretBitLen in bytes.
- SecretByteLen: uint((252 + 7) / 8),
+ SecretByteLen: 28,
// 3-torsion group computation strategy
IsogenyStrategy: []uint32{
- 0x47, 0x26, 0x15, 0x0D, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02,
- 0x01, 0x01, 0x02, 0x01, 0x01, 0x05, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02,
- 0x01, 0x01, 0x01, 0x09, 0x05, 0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01,
- 0x01, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x11, 0x09, 0x05, 0x03, 0x02,
- 0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02,
- 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01,
- 0x01, 0x02, 0x01, 0x01, 0x21, 0x11, 0x09, 0x05, 0x03, 0x02, 0x01, 0x01, 0x01, 0x01,
- 0x02, 0x01, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04,
- 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
- 0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01,
- 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01,
- 0x01, 0x02, 0x01, 0x01},
+ 0x42, 0x21, 0x11, 0x09, 0x05, 0x03, 0x02, 0x01, 0x01, 0x01,
+ 0x01, 0x02, 0x01, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x01,
+ 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02,
+ 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x10,
+ 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04,
+ 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01,
+ 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+ 0x01, 0x20, 0x10, 0x08, 0x04, 0x03, 0x01, 0x01, 0x01, 0x01,
+ 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
+ 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02,
+ 0x01, 0x01, 0x02, 0x01, 0x01, 0x10, 0x08, 0x04, 0x02, 0x01,
+ 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+ 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04,
+ 0x02, 0x01, 0x01, 0x02, 0x01, 0x01},
},
- OneFp2: P503_OneFp2,
- HalfFp2: P503_HalfFp2,
- MsgLen: 24,
- // SIKEp503 provides 128 bit of classical security ([SIKE], 5.1)
+ OneFp2: one,
+ HalfFp2: half,
+ MsgLen: 16,
+ // SIKEp434 provides 128 bit of classical security ([SIKE], 5.1)
KemSize: 16,
- // ceil(503+7/8)
- Bytelen: 63,
+ // ceil(434/8)
+ Bytelen: 55,
+ CiphertextSize: 16 + 330,
}
}
diff --git a/ssl/test/runner/sike/curve.go b/ssl/test/runner/sike/curve.go
index 69febaf..8172546 100644
--- a/ssl/test/runner/sike/curve.go
+++ b/ssl/test/runner/sike/curve.go
@@ -72,9 +72,9 @@
var aRR FpX2
// convert to montgomery domain
- fpMul(&aRR, &x.A, &p503R2) // = a*R*R
- fpMontRdc(&x.A, &aRR) // = a*R mod p
- fpMul(&aRR, &x.B, &p503R2)
+ fpMul(&aRR, &x.A, &R2) // = a*R*R
+ fpMontRdc(&x.A, &aRR) // = a*R mod p
+ fpMul(&aRR, &x.B, &R2)
fpMontRdc(&x.B, &aRR)
}
diff --git a/ssl/test/runner/sike/sike.go b/ssl/test/runner/sike/sike.go
index fc6de05..dcd6cfc 100644
--- a/ssl/test/runner/sike/sike.go
+++ b/ssl/test/runner/sike/sike.go
@@ -15,27 +15,12 @@
package sike
import (
- "crypto/hmac"
"crypto/sha256"
"crypto/subtle"
"errors"
"io"
)
-// Constants used for cSHAKE customization
-// Those values are different than in [SIKE] - they are encoded on 16bits. This is
-// done in order for implementation to be compatible with [REF] and test vectors.
-var G = []byte{0x00, 0x00}
-var H = []byte{0x01, 0x00}
-var F = []byte{0x02, 0x00}
-
-// Generates HMAC-SHA256 sum
-func hashMac(out, in, S []byte) {
- h := hmac.New(sha256.New, in)
- h.Write(S)
- copy(out, h.Sum(nil))
-}
-
// Zeroize Fp2
func zeroize(fp *Fp2) {
// Zeroizing in 2 separated loops tells compiler to
@@ -236,9 +221,8 @@
// Generate a public key in the 2-torsion group
func publicKeyGenA(prv *PrivateKey) (pub *PublicKey) {
var xPA, xQA, xRA ProjectivePoint
- var xPB, xQB, xRB, xR ProjectivePoint
+ var xPB, xQB, xRB, xK ProjectivePoint
var invZP, invZQ, invZR Fp2
- var tmp ProjectiveCurveParameters
pub = NewPublicKey(KeyVariant_SIDH_A)
var phi = NewIsogeny4()
@@ -254,16 +238,11 @@
xPB = ProjectivePoint{X: prv.params.B.Affine_P, Z: prv.params.OneFp2}
// Find isogeny kernel
- tmp.C = pub.params.OneFp2
- xR = ScalarMul3Pt(&tmp, &xPA, &xQA, &xRA, prv.params.A.SecretBitLen, prv.Scalar)
-
- // Reset params object and travers isogeny tree
- tmp.C = pub.params.OneFp2
- zeroize(&tmp.A)
- traverseTreePublicKeyA(&tmp, &xR, &xPB, &xQB, &xRB, pub)
+ xK = ScalarMul3Pt(&pub.params.InitCurve, &xPA, &xQA, &xRA, prv.params.A.SecretBitLen, prv.Scalar)
+ traverseTreePublicKeyA(&pub.params.InitCurve, &xK, &xPB, &xQB, &xRB, pub)
// Secret isogeny
- phi.GenerateCurve(&xR)
+ phi.GenerateCurve(&xK)
xPA = phi.EvaluatePoint(&xPB)
xQA = phi.EvaluatePoint(&xQB)
xRA = phi.EvaluatePoint(&xRB)
@@ -277,10 +256,9 @@
// Generate a public key in the 3-torsion group
func publicKeyGenB(prv *PrivateKey) (pub *PublicKey) {
- var xPB, xQB, xRB, xR ProjectivePoint
+ var xPB, xQB, xRB, xK ProjectivePoint
var xPA, xQA, xRA ProjectivePoint
var invZP, invZQ, invZR Fp2
- var tmp ProjectiveCurveParameters
pub = NewPublicKey(prv.keyVariant)
var phi = NewIsogeny3()
@@ -295,14 +273,10 @@
xQA = ProjectivePoint{X: prv.params.A.Affine_Q, Z: prv.params.OneFp2}
xRA = ProjectivePoint{X: prv.params.A.Affine_R, Z: prv.params.OneFp2}
- tmp.C = pub.params.OneFp2
- xR = ScalarMul3Pt(&tmp, &xPB, &xQB, &xRB, prv.params.B.SecretBitLen, prv.Scalar)
+ xK = ScalarMul3Pt(&pub.params.InitCurve, &xPB, &xQB, &xRB, prv.params.B.SecretBitLen, prv.Scalar)
+ traverseTreePublicKeyB(&pub.params.InitCurve, &xK, &xPA, &xQA, &xRA, pub)
- tmp.C = pub.params.OneFp2
- zeroize(&tmp.A)
- traverseTreePublicKeyB(&tmp, &xR, &xPA, &xQA, &xRA, pub)
-
- phi.GenerateCurve(&xR)
+ phi.GenerateCurve(&xK)
xPB = phi.EvaluatePoint(&xPA)
xQB = phi.EvaluatePoint(&xQA)
xRB = phi.EvaluatePoint(&xRA)
@@ -321,27 +295,28 @@
// Establishing shared keys in in 2-torsion group
func deriveSecretA(prv *PrivateKey, pub *PublicKey) []byte {
var sharedSecret = make([]byte, pub.params.SharedSecretSize)
- var cparam ProjectiveCurveParameters
var xP, xQ, xQmP ProjectivePoint
- var xR ProjectivePoint
+ var xK ProjectivePoint
+ var cparam ProjectiveCurveParameters
var phi = NewIsogeny4()
var jInv Fp2
// Recover curve coefficients
- cparam.C = pub.params.OneFp2
RecoverCoordinateA(&cparam, &pub.affine_xP, &pub.affine_xQ, &pub.affine_xQmP)
+ // C=1
+ cparam.C = Params.OneFp2
// Find kernel of the morphism
xP = ProjectivePoint{X: pub.affine_xP, Z: pub.params.OneFp2}
xQ = ProjectivePoint{X: pub.affine_xQ, Z: pub.params.OneFp2}
xQmP = ProjectivePoint{X: pub.affine_xQmP, Z: pub.params.OneFp2}
- xR = ScalarMul3Pt(&cparam, &xP, &xQ, &xQmP, pub.params.A.SecretBitLen, prv.Scalar)
+ xK = ScalarMul3Pt(&cparam, &xP, &xQ, &xQmP, pub.params.A.SecretBitLen, prv.Scalar)
// Traverse isogeny tree
- traverseTreeSharedKeyA(&cparam, &xR, pub)
+ traverseTreeSharedKeyA(&cparam, &xK, pub)
// Calculate j-invariant on isogeneus curve
- c := phi.GenerateCurve(&xR)
+ c := phi.GenerateCurve(&xK)
RecoverCurveCoefficients4(&cparam, &c)
Jinvariant(&cparam, &jInv)
convFp2ToBytes(sharedSecret, &jInv)
@@ -352,26 +327,27 @@
func deriveSecretB(prv *PrivateKey, pub *PublicKey) []byte {
var sharedSecret = make([]byte, pub.params.SharedSecretSize)
var xP, xQ, xQmP ProjectivePoint
- var xR ProjectivePoint
+ var xK ProjectivePoint
var cparam ProjectiveCurveParameters
var phi = NewIsogeny3()
var jInv Fp2
- // Recover curve coefficients
- cparam.C = pub.params.OneFp2
+ // Recover curve A coefficient
RecoverCoordinateA(&cparam, &pub.affine_xP, &pub.affine_xQ, &pub.affine_xQmP)
+ // C=1
+ cparam.C = Params.OneFp2
// Find kernel of the morphism
xP = ProjectivePoint{X: pub.affine_xP, Z: pub.params.OneFp2}
xQ = ProjectivePoint{X: pub.affine_xQ, Z: pub.params.OneFp2}
xQmP = ProjectivePoint{X: pub.affine_xQmP, Z: pub.params.OneFp2}
- xR = ScalarMul3Pt(&cparam, &xP, &xQ, &xQmP, pub.params.B.SecretBitLen, prv.Scalar)
+ xK = ScalarMul3Pt(&cparam, &xP, &xQ, &xQmP, pub.params.B.SecretBitLen, prv.Scalar)
// Traverse isogeny tree
- traverseTreeSharedKeyB(&cparam, &xR, pub)
+ traverseTreeSharedKeyB(&cparam, &xK, pub)
// Calculate j-invariant on isogeneus curve
- c := phi.GenerateCurve(&xR)
+ c := phi.GenerateCurve(&xK)
RecoverCurveCoefficients3(&cparam, &c)
Jinvariant(&cparam, &jInv)
convFp2ToBytes(sharedSecret, &jInv)
@@ -379,9 +355,6 @@
}
func encrypt(skA *PrivateKey, pkA, pkB *PublicKey, ptext []byte) ([]byte, error) {
- var n [40]byte // n can is max 320-bit (see 1.4 of [SIKE])
- var ptextLen = len(ptext)
-
if pkB.keyVariant != KeyVariant_SIKE {
return nil, errors.New("wrong key type")
}
@@ -391,14 +364,19 @@
return nil, err
}
- hashMac(n[:ptextLen], j, F)
- for i, _ := range ptext {
- n[i] ^= ptext[i]
+ if len(ptext) != pkA.params.KemSize {
+ panic("Implementation error")
}
- ret := make([]byte, pkA.Size()+ptextLen)
+ digest := sha256.Sum256(j)
+ // Uses truncated digest (first 16-bytes)
+ for i, _ := range ptext {
+ digest[i] ^= ptext[i]
+ }
+
+ ret := make([]byte, pkA.Size()+len(ptext))
copy(ret, pkA.Export())
- copy(ret[pkA.Size():], n[:ptextLen])
+ copy(ret[pkA.Size():], digest[:pkA.params.KemSize])
return ret, nil
}
@@ -565,7 +543,7 @@
func Encrypt(rng io.Reader, pub *PublicKey, ptext []byte) ([]byte, error) {
var ptextLen = len(ptext)
// c1 must be security level + 64 bits (see [SIKE] 1.4 and 4.3.3)
- if ptextLen != (pub.params.KemSize + 8) {
+ if ptextLen != pub.params.KemSize {
return nil, errors.New("Unsupported message length")
}
@@ -583,9 +561,9 @@
// decryption succeeds or error in case unexptected input was provided.
// Constant time
func Decrypt(prv *PrivateKey, ctext []byte) ([]byte, error) {
- var n [40]byte // n can is max 320-bit (see 1.4 of [SIKE])
var c1_len int
- var pk_len = prv.params.PublicKeySize
+ n := make([]byte, prv.params.KemSize)
+ pk_len := prv.params.PublicKeySize
if prv.keyVariant != KeyVariant_SIKE {
return nil, errors.New("wrong key type")
@@ -594,7 +572,7 @@
// ctext is a concatenation of (pubkey_A || c1=ciphertext)
// it must be security level + 64 bits (see [SIKE] 1.4 and 4.3.3)
c1_len = len(ctext) - pk_len
- if c1_len != (int(prv.params.KemSize) + 8) {
+ if c1_len != int(prv.params.KemSize) {
return nil, errors.New("wrong size of cipher text")
}
@@ -608,8 +586,10 @@
return nil, err
}
- hashMac(n[:c1_len], j, F)
- for i, _ := range n[:c1_len] {
+ digest := sha256.Sum256(j)
+ copy(n, digest[:])
+
+ for i, _ := range n {
n[i] ^= ctext[pk_len+i]
}
return n[:c1_len], nil
@@ -621,11 +601,9 @@
// Error is returned in case PRNG fails or wrongly formatted input was provided.
func Encapsulate(rng io.Reader, pub *PublicKey) (ctext []byte, secret []byte, err error) {
// Buffer for random, secret message
- var ptext = make([]byte, pub.params.MsgLen)
- // r = G(ptext||pub)
- var r = make([]byte, pub.params.A.SecretByteLen)
- // Resulting shared secret
- secret = make([]byte, pub.params.KemSize)
+ ptext := make([]byte, pub.params.MsgLen)
+ // SHA256 hash context object
+ d := sha256.New()
// Generate ephemeral value
_, err = io.ReadFull(rng, ptext)
@@ -633,13 +611,12 @@
return nil, nil, err
}
- // must be big enough to store ptext+c0+c1
- var hmac_key = make([]byte, pub.Size()+2*Params.MsgLen)
- copy(hmac_key, ptext)
- copy(hmac_key[len(ptext):], pub.Export())
- hashMac(r, hmac_key[:len(ptext)+pub.Size()], G)
- // Ensure bitlength is not bigger than to 2^e2-1
- r[len(r)-1] &= (1 << (pub.params.A.SecretBitLen % 8)) - 1
+ // Only the first A.SecretByteLen bytes of the digest are used as r
+ d.Write(ptext)
+ d.Write(pub.Export())
+ digest := d.Sum(nil)
+ // r = G(ptext||pub)
+ r := digest[:pub.params.A.SecretByteLen]
// (c0 || c1) = Enc(pkA, ptext; r)
skA := NewPrivateKey(KeyVariant_SIDH_A)
@@ -655,10 +632,11 @@
}
// K = H(ptext||(c0||c1))
- copy(hmac_key, ptext)
- copy(hmac_key[len(ptext):], ctext)
- hashMac(secret, hmac_key[:len(ptext)+len(ctext)], H)
- return ctext, secret, nil
+ d.Reset()
+ d.Write(ptext)
+ d.Write(ctext)
+ digest = d.Sum(digest[:0])
+ return ctext, digest[:pub.params.KemSize], nil
}
// Decapsulate given the keypair and ciphertext as inputs, Decapsulate outputs a shared
@@ -666,10 +644,9 @@
// Decapsulation may fail in case input is wrongly formatted.
// Constant time for properly initialized input.
func Decapsulate(prv *PrivateKey, pub *PublicKey, ctext []byte) ([]byte, error) {
- var r = make([]byte, pub.params.A.SecretByteLen)
- // Resulting shared secret
- var secret = make([]byte, pub.params.KemSize)
var skA = NewPrivateKey(KeyVariant_SIDH_A)
+ // SHA256 hash context object
+ d := sha256.New()
m, err := Decrypt(prv, ctext)
if err != nil {
@@ -677,33 +654,30 @@
}
// r' = G(m'||pub)
- var hmac_key = make([]byte, pub.Size()+2*Params.MsgLen)
- copy(hmac_key, m)
- copy(hmac_key[len(m):], pub.Export())
- hashMac(r, hmac_key[:len(m)+pub.Size()], G)
- // Ensure bitlength is not bigger than 2^e2-1
- r[len(r)-1] &= (1 << (pub.params.A.SecretBitLen % 8)) - 1
-
+ d.Write(m)
+ d.Write(pub.Export())
+ digest := d.Sum(nil)
// Never fails
- skA.Import(r)
+ skA.Import(digest[:pub.params.A.SecretByteLen])
// Never fails
pkA := skA.GeneratePublicKey()
c0 := pkA.Export()
+ d.Reset()
if subtle.ConstantTimeCompare(c0, ctext[:len(c0)]) == 1 {
- copy(hmac_key, m)
+ d.Write(m)
} else {
- // S is chosen at random when generating a key and unknown to other party. It
+ // S is chosen at random when generating a key and is unknown to the other party. It
// may seem weird, but it's correct. It is important that S is unpredictable
// to other party. Without this check, it is possible to recover a secret, by
// providing series of invalid ciphertexts. It is also important that in case
//
// See more details in "On the security of supersingular isogeny cryptosystems"
// (S. Galbraith, et al., 2016, ePrint #859).
- copy(hmac_key, prv.S)
+ d.Write(prv.S)
}
- copy(hmac_key[len(m):], ctext)
- hashMac(secret, hmac_key[:len(m)+len(ctext)], H)
- return secret, nil
+ d.Write(ctext)
+ digest = d.Sum(digest[:0])
+ return digest[:pub.params.KemSize], nil
}
diff --git a/ssl/test/runner/sike/sike_test.go b/ssl/test/runner/sike/sike_test.go
index 2813504..2e146bc 100644
--- a/ssl/test/runner/sike/sike_test.go
+++ b/ssl/test/runner/sike/sike_test.go
@@ -28,41 +28,55 @@
name string
PrB_sidh string
PkB_sidh string
+ PrA_sidh string
+ PkA_sidh string
PkB_sike string
PrB_sike string
- PrA_sike string
- PkA_sike string
}{
- name: "P-503",
- PkB_sike: "68460C22466E95864CFEA7B5D9077E768FF4F9ED69AE56D7CF3F236FB06B31020EEE34B5B572CEA5DDF20B531966AA8F5F3ACC0C6D1CE04EEDC30FD1F1233E2D96FE60C6D638FC646EAF2E2246F1AEC96859CE874A1F029A78F9C978CD6B22114A0D5AB20101191FD923E80C76908B1498B9D0200065CCA09159A0C65A1E346CC6470314FE78388DAA89DD08EC67DBE63C1F606674ACC49EBF9FDBB2B898B3CE733113AA6F942DB401A76D629CE6EE6C0FDAF4CFB1A5E366DB66C17B3923A1B7FB26A3FF25B9018869C674D3DEF4AF269901D686FE4647F9D2CDB2CEB3AFA305B27C885F037ED167F595066C21E7DD467D8332B934A5102DA5F13332DFA356B82156A0BB2E7E91C6B85B7D1E381BC9E3F0FC4DB9C36016D9ECEC415D7E977E9AC29910D934BA2FE4EE49D3B387607A4E1AFABF495FB86A77194626589E802FF5167C7A25C542C1EAD25A6E0AA931D94F2F9AFD3DBDF222E651F729A90E77B20974905F1E65E041CE6C95AAB3E1F22D332E0A5DE9C5DB3D9C7A38",
- PrB_sike: "80FC55DA74DEFE3113487B80841E678AF9ED4E0599CF07353A4AB93971C090A0" +
- "A9402C9DC98AC6DC8F5FDE5E970AE22BA48A400EFC72851C",
- PrB_sidh: "A885A8B889520A6DBAD9FB33365E5B77FDED629440A16A533F259A510F63A822",
- PrA_sike: "B0AD510708F4ABCF3E0D97DC2F2FF112D9D2AAE49D97FFD1E4267F21C6E71C03",
- PkA_sike: "A6BADBA04518A924B20046B59AC197DCDF0EA48014C9E228C4994CCA432F360E" +
- "2D527AFB06CA7C96EE5CEE19BAD53BF9218A3961CAD7EC092BD8D9EBB22A3D51" +
- "33008895A3F1F6A023F91E0FE06A00A622FD6335DAC107F8EC4283DC2632F080" +
- "4E64B390DAD8A2572F1947C67FDF4F8787D140CE2C6B24E752DA9A195040EDFA" +
- "C27333FAE97DBDEB41DA9EEB2DB067AE7DA8C58C0EF57AEFC18A3D6BD0576FF2" +
- "F1CFCAEC50C958331BF631F3D2E769790C7B6DF282B74BBC02998AD10F291D47" +
- "C5A762FF84253D3B3278BDF20C8D4D4AA317BE401B884E26A1F02C7308AADB68" +
- "20EBDB0D339F5A63346F3B40CACED72F544DAF51566C6E807D0E6E1E38514342" +
- "432661DC9564DA07548570E256688CD9E8060D8775F95D501886D958588CACA0" +
- "9F2D2AE1913F996E76AF63E31A179A7A7D2A46EDA03B2BCCF9020A5AA15F9A28" +
- "9340B33F3AE7F97360D45F8AE1B9DD48779A57E8C45B50A02C00349CD1C58C55" +
- "1D68BC2A75EAFED944E8C599C288037181E997471352E24C952B",
- PkB_sidh: "244AF1F367C2C33912750A98497CC8214BC195BD52BD76513D32ACE4B75E31F0" +
- "281755C265F5565C74E3C04182B9C244071859C8588CC7F09547CEFF8F7705D2" +
- "60CE87D6BFF914EE7DBE4B9AF051CA420062EEBDF043AF58184495026949B068" +
- "98A47046BFAE8DF3B447746184AF550553BB5D266D6E1967ACA33CAC5F399F90" +
- "360D70867F2C71EF6F94FF915C7DA8BC9549FB7656E691DAEFC93CF56876E482" +
- "CA2F8BE2D6CDCC374C31AD8833CABE997CC92305F38497BEC4DFD1821B004FEC" +
- "E16448F9A24F965EFE409A8939EEA671633D9FFCF961283E59B8834BDF7EDDB3" +
- "05D6275B61DA6692325432A0BAA074FC7C1F51E76208AB193A57520D40A76334" +
- "EE5712BDC3E1EFB6103966F2329EDFF63082C4DFCDF6BE1C5A048630B81871B8" +
- "83B735748A8FD4E2D9530C272163AB18105B10015CA7456202FE1C9B92CEB167" +
- "5EAE1132E582C88E47ED87B363D45F05BEA714D5E9933D7AF4071CBB5D49008F" +
- "3E3DAD7DFF935EE509D5DE561842B678CCEB133D62E270E9AC3E",
+ name: "P-434",
+ PrA_sidh: "3A727E04EA9B7E2A766A6F846489E7E7B915263BCEED308BB10FC9",
+ PkA_sidh: "9E668D1E6750ED4B91EE052C32839CA9DD2E56D52BC24DECC950AA" +
+ "AD24CEED3F9049C77FE80F0B9B01E7F8DAD7833EEC2286544D6380" +
+ "009C379CDD3E7517CEF5E20EB01F8231D52FC30DC61D2F63FB357F" +
+ "85DC6396E8A95DB9740BD3A972C8DB7901B31F074CD3E45345CA78" +
+ "F900817130E688A29A7CF0073B5C00FF2C65FBE776918EF9BD8E75" +
+ "B29EF7FAB791969B60B0C5B37A8992EDEF95FA7BAC40A95DAFE02E" +
+ "237301FEE9A7A43FD0B73477E8035DD12B73FAFEF18D39904DDE36" +
+ "53A754F36BE1888F6607C6A7951349A414352CF31A29F2C40302DB" +
+ "406C48018C905EB9DC46AFBF42A9187A9BB9E51B587622A2862DC7" +
+ "D5CC598BF38ED6320FB51D8697AD3D7A72ABCC32A393F0133DA8DF" +
+ "5E253D9E00B760B2DF342FCE974DCFE946CFE4727783531882800F" +
+ "9E5DD594D6D5A6275EEFEF9713ED838F4A06BB34D7B8D46E0B385A" +
+ "AEA1C7963601",
+ PrB_sidh: "E37BFE55B43B32448F375903D8D226EC94ADBFEA1D2B3536EB987001",
+ PkB_sidh: "C9F73E4497AAA3FDF9EB688135866A8A83934BA10E273B8CC3808C" +
+ "F0C1F5FAB3E9BB295885881B73DEBC875670C0F51C4BB40DF5FEDE" +
+ "01B8AF32D1BF10508B8C17B2734EB93B2B7F5D84A4A0F2F816E9E2" +
+ "C32AC253C0B6025B124D05A87A9E2A8567930F44BAA14219B941B6" +
+ "B400B4AED1D796DA12A5A9F0B8F3F5EE9DD43F64CB24A3B1719DF2" +
+ "78ADF56B5F3395187829DA2319DEABF6BBD6EDA244DE2B62CC5AC2" +
+ "50C1009DD1CD4712B0B37406612AD002B5E51A62B51AC9C0374D14" +
+ "3ABBBD58275FAFC4A5E959C54838C2D6D9FB43B7B2609061267B6A" +
+ "2E6C6D01D295C4223E0D3D7A4CDCFB28A7818A737935279751A6DD" +
+ "8290FD498D1F6AD5F4FFF6BDFA536713F509DCE8047252F1E7D0DD" +
+ "9FCC414C0070B5DCCE3665A21A032D7FBE749181032183AFAD240B" +
+ "7E671E87FBBEC3A8CA4C11AA7A9A23AC69AE2ACF54B664DECD2775" +
+ "3D63508F1B02",
+ PrB_sike: "4B622DE1350119C45A9F2E2EF3DC5DF56A27FCDFCDDAF58CD69B90" +
+ "3752D68C200934E160B234E49EDE247601",
+ PkB_sike: "1BD0A2E81307B6F96461317DDF535ACC0E59C742627BAE60D27605" +
+ "E10FAF722D22A73E184CB572A12E79DCD58C6B54FB01442114CBE9" +
+ "010B6CAEC25D04C16C5E42540C1524C545B8C67614ED4183C9FA5B" +
+ "D0BE45A7F89FBC770EE8E7E5E391C7EE6F35F74C29E6D9E35B1663" +
+ "DA01E48E9DEB2347512D366FDE505161677055E3EF23054D276E81" +
+ "7E2C57025DA1C10D2461F68617F2D11256EEE4E2D7DBDF6C8E34F3" +
+ "A0FD00C625428CB41857002159DAB94267ABE42D630C6AAA91AF83" +
+ "7C7A6740754EA6634C45454C51B0BB4D44C3CCCCE4B32C00901CF6" +
+ "9C008D013348379B2F9837F428A01B6173584691F2A6F3A3C4CF48" +
+ "7D20D261B36C8CDB1BC158E2A5162A9DA4F7A97AA0879B9897E2B6" +
+ "891B672201F9AEFBF799C27B2587120AC586A511360926FB7DA8EB" +
+ "F5CB5272F396AE06608422BE9792E2CE9BEF21BF55B7EFF8DC7EC8" +
+ "C99910D3F800",
}
/* -------------------------------------------------------------------------
@@ -70,6 +84,7 @@
-------------------------------------------------------------------------*/
// Fail if err !=nil. Display msg as an error message
func checkErr(t testing.TB, err error, msg string) {
+ t.Helper()
if err != nil {
t.Error(msg)
}
@@ -110,9 +125,9 @@
Unit tests
-------------------------------------------------------------------------*/
func TestKeygen(t *testing.T) {
- alicePrivate := convToPrv(tdata.PrA_sike, KeyVariant_SIDH_A)
+ alicePrivate := convToPrv(tdata.PrA_sidh, KeyVariant_SIDH_A)
bobPrivate := convToPrv(tdata.PrB_sidh, KeyVariant_SIDH_B)
- expPubA := convToPub(tdata.PkA_sike, KeyVariant_SIDH_A)
+ expPubA := convToPub(tdata.PkA_sidh, KeyVariant_SIDH_A)
expPubB := convToPub(tdata.PkB_sidh, KeyVariant_SIDH_B)
pubA := alicePrivate.GeneratePublicKey()
@@ -132,7 +147,7 @@
b := NewPublicKey(KeyVariant_SIDH_B)
// Import keys
- a_hex, err := hex.DecodeString(tdata.PkA_sike)
+ a_hex, err := hex.DecodeString(tdata.PkA_sidh)
checkErr(t, err, "invalid hex-number provided")
err = a.Import(a_hex)
@@ -210,7 +225,7 @@
}
// Negative case
- dec, e := hex.DecodeString(tdata.PkA_sike)
+ dec, e := hex.DecodeString(tdata.PkA_sidh)
if e != nil {
t.FailNow()
}
@@ -387,6 +402,7 @@
// calculated shared secret
ct, ss_e, err := Encapsulate(rand.Reader, pk)
+
checkErr(t, err, "encapsulation failed")
ss_d, err := Decapsulate(sk, pk, ct)
checkErr(t, err, "decapsulation failed")
@@ -512,115 +528,106 @@
}
func TestKeyAgreement(t *testing.T) {
- testKeyAgreement(t, tdata.PkA_sike, tdata.PrA_sike, tdata.PkB_sidh, tdata.PrB_sidh)
+ testKeyAgreement(t, tdata.PkA_sidh, tdata.PrA_sidh, tdata.PkB_sidh, tdata.PrB_sidh)
}
// Same values as in sike_test.cc
func TestDecapsulation(t *testing.T) {
-
- var sk = [56]byte{
- 0xDB, 0xAF, 0x2C, 0x89, 0xCA, 0x5A, 0xD4, 0x9D, 0x4F, 0x13,
- 0x40, 0xDF, 0x2D, 0xB1, 0x5F, 0x4C, 0x91, 0xA7, 0x1F, 0x0B,
- 0x29, 0x15, 0x01, 0x59, 0xBC, 0x5F, 0x0B, 0x4A, 0x03, 0x27,
- 0x6F, 0x18}
-
- var pk = []byte{
- 0x07, 0xAA, 0x51, 0x45, 0x3E, 0x1F, 0x53, 0x2A, 0x0A, 0x05,
- 0x46, 0xF6, 0x54, 0x7F, 0x5D, 0x56, 0xD6, 0x76, 0xD3, 0xEA,
- 0x4B, 0x6B, 0x01, 0x9B, 0x11, 0x72, 0x6F, 0x75, 0xEA, 0x34,
- 0x3C, 0x28, 0x2C, 0x36, 0xFD, 0x77, 0xDA, 0xBE, 0xB6, 0x20,
- 0x18, 0xC1, 0x93, 0x98, 0x18, 0x86, 0x30, 0x2F, 0x2E, 0xD2,
- 0x00, 0x61, 0xFF, 0xAE, 0x78, 0xAE, 0xFB, 0x6F, 0x32, 0xAC,
- 0x06, 0xBF, 0x35, 0xF6, 0xF7, 0x5B, 0x98, 0x26, 0x95, 0xC2,
- 0xD8, 0xD6, 0x1C, 0x0E, 0x47, 0xDA, 0x76, 0xCE, 0xB5, 0xF1,
- 0x19, 0xCC, 0x01, 0xE1, 0x17, 0xA9, 0x62, 0xF7, 0x82, 0x6C,
- 0x25, 0x51, 0x25, 0xAE, 0xFE, 0xE3, 0xE2, 0xE1, 0x35, 0xAE,
- 0x2E, 0x8F, 0x38, 0xE0, 0x7C, 0x74, 0x3C, 0x1D, 0x39, 0x91,
- 0x1B, 0xC7, 0x9F, 0x8E, 0x33, 0x4E, 0x84, 0x19, 0xB8, 0xD9,
- 0xC2, 0x71, 0x35, 0x02, 0x47, 0x3E, 0x79, 0xEF, 0x47, 0xE1,
- 0xD8, 0x21, 0x96, 0x1F, 0x11, 0x59, 0x39, 0x34, 0x76, 0xEF,
- 0x3E, 0xB7, 0x4E, 0xFB, 0x7C, 0x55, 0xA1, 0x85, 0xAA, 0xAB,
- 0xAD, 0xF0, 0x09, 0xCB, 0xD1, 0xE3, 0x7C, 0x4F, 0x5D, 0x2D,
- 0xE1, 0x13, 0xF0, 0x71, 0xD9, 0xE5, 0xF6, 0xAF, 0x7F, 0xC1,
- 0x27, 0x95, 0x8D, 0x52, 0xD5, 0x96, 0x42, 0x38, 0x41, 0xF7,
- 0x24, 0x3F, 0x3A, 0xB5, 0x7E, 0x11, 0xE4, 0xF9, 0x33, 0xEE,
- 0x4D, 0xBE, 0x74, 0x48, 0xF9, 0x98, 0x04, 0x01, 0x16, 0xEB,
- 0xA9, 0x0D, 0x61, 0xC6, 0xFD, 0x4C, 0xCF, 0x98, 0x84, 0x4A,
- 0x94, 0xAC, 0x69, 0x2C, 0x02, 0x8B, 0xE3, 0xD1, 0x41, 0x0D,
- 0xF2, 0x2D, 0x46, 0x1F, 0x57, 0x1C, 0x77, 0x86, 0x18, 0xE3,
- 0x63, 0xDE, 0xF3, 0xE3, 0x02, 0x30, 0x54, 0x73, 0xAE, 0xC2,
- 0x32, 0xA2, 0xCE, 0xEB, 0xCF, 0x81, 0x46, 0x54, 0x5C, 0xF4,
- 0x5D, 0x2A, 0x03, 0x5D, 0x9C, 0xAE, 0xE0, 0x60, 0x03, 0x80,
- 0x11, 0x30, 0xA5, 0xAA, 0xD1, 0x75, 0x67, 0xE0, 0x1C, 0x2B,
- 0x6B, 0x5D, 0x83, 0xDE, 0x92, 0x9B, 0x0E, 0xD7, 0x11, 0x0F,
- 0x00, 0xC4, 0x59, 0xE4, 0x81, 0x04, 0x3B, 0xEE, 0x5C, 0x04,
- 0xD1, 0x0E, 0xD0, 0x67, 0xF5, 0xCC, 0xAA, 0x72, 0x73, 0xEA,
- 0xC4, 0x76, 0x99, 0x3B, 0x4C, 0x90, 0x2F, 0xCB, 0xD8, 0x0A,
- 0x5B, 0xEC, 0x0E, 0x0E, 0x1F, 0x59, 0xEA, 0x14, 0x8D, 0x34,
- 0x53, 0x65, 0x4C, 0x1A, 0x59, 0xA8, 0x95, 0x66, 0x60, 0xBB,
- 0xC4, 0xCC, 0x32, 0xA9, 0x8D, 0x2A, 0xAA, 0x14, 0x6F, 0x0F,
- 0x81, 0x4D, 0x32, 0x02, 0xFD, 0x33, 0x58, 0x42, 0xCF, 0xF3,
- 0x67, 0xD0, 0x9F, 0x0B, 0xB1, 0xCC, 0x18, 0xA5, 0xC4, 0x19,
- 0xB6, 0x00, 0xED, 0xFA, 0x32, 0x1A, 0x5F, 0x67, 0xC8, 0xC3,
- 0xEB, 0x0D, 0xB5, 0x9A, 0x36, 0x47, 0x82, 0x00,
+ var sk = [16 + 28]byte{
+ 0x04, 0x5E, 0x01, 0x42, 0xB8, 0x2F, 0xE1, 0x9A, 0x38, 0x25,
+ 0x92, 0xE7, 0xDC, 0xBA, 0xF7, 0x1B, 0xB1, 0xFD, 0x34, 0x42,
+ 0xDB, 0x02, 0xBC, 0x9D, 0x4C, 0xD0, 0x72, 0x34, 0x4D, 0xBD,
+ 0x06, 0xDF, 0x1C, 0x7D, 0x0A, 0x88, 0xB2, 0x50, 0xC4, 0xF6,
+ 0xAE, 0xE8, 0x25, 0x01,
}
- var ct = []byte{
- 0xE6, 0xB7, 0xE5, 0x7B, 0xA9, 0x19, 0xD1, 0x2C, 0xB8, 0x5C,
- 0x7B, 0x66, 0x74, 0xB0, 0x71, 0xA1, 0xFF, 0x71, 0x7F, 0x4B,
- 0xB5, 0xA6, 0xAF, 0x48, 0x32, 0x52, 0xD5, 0x82, 0xEE, 0x8A,
- 0xBB, 0x08, 0x1E, 0xF6, 0xAC, 0x91, 0xA2, 0xCB, 0x6B, 0x6A,
- 0x09, 0x2B, 0xD9, 0xC6, 0x27, 0xD6, 0x3A, 0x6B, 0x8D, 0xFC,
- 0xB8, 0x90, 0x8F, 0x72, 0xB3, 0xFA, 0x7D, 0x34, 0x7A, 0xC4,
- 0x7E, 0xE3, 0x30, 0xC5, 0xA0, 0xFE, 0x3D, 0x43, 0x14, 0x4E,
- 0x3A, 0x14, 0x76, 0x3E, 0xFB, 0xDF, 0xE3, 0xA8, 0xE3, 0x5E,
- 0x38, 0xF2, 0xE0, 0x39, 0x67, 0x60, 0xFD, 0xFB, 0xB4, 0x19,
- 0xCD, 0xE1, 0x93, 0xA2, 0x06, 0xCC, 0x65, 0xCD, 0x6E, 0xC8,
- 0xB4, 0x5E, 0x41, 0x4B, 0x6C, 0xA5, 0xF4, 0xE4, 0x9D, 0x52,
- 0x8C, 0x25, 0x60, 0xDD, 0x3D, 0xA9, 0x7F, 0xF2, 0x88, 0xC1,
- 0x0C, 0xEE, 0x97, 0xE0, 0xE7, 0x3B, 0xB7, 0xD3, 0x6F, 0x28,
- 0x79, 0x2F, 0x50, 0xB2, 0x4F, 0x74, 0x3A, 0x0C, 0x88, 0x27,
- 0x98, 0x3A, 0x27, 0xD3, 0x26, 0x83, 0x59, 0x49, 0x81, 0x5B,
- 0x0D, 0xA7, 0x0C, 0x4F, 0xEF, 0xFB, 0x1E, 0xAF, 0xE9, 0xD2,
- 0x1C, 0x10, 0x25, 0xEC, 0x9E, 0xFA, 0x57, 0x36, 0xAA, 0x3F,
- 0xC1, 0xA3, 0x2C, 0xE9, 0xB5, 0xC9, 0xED, 0x72, 0x51, 0x4C,
- 0x02, 0xB4, 0x7B, 0xB3, 0xED, 0x9F, 0x45, 0x03, 0x34, 0xAC,
- 0x9A, 0x9E, 0x62, 0x5F, 0x82, 0x7A, 0x77, 0x34, 0xF9, 0x21,
- 0x94, 0xD2, 0x38, 0x3D, 0x05, 0xF0, 0x8A, 0x60, 0x1C, 0xB7,
- 0x1D, 0xF5, 0xB7, 0x53, 0x77, 0xD3, 0x9D, 0x3D, 0x70, 0x6A,
- 0xCB, 0x18, 0x20, 0x6B, 0x29, 0x17, 0x3A, 0x6D, 0xA1, 0xB2,
- 0x64, 0xDB, 0x6C, 0xE6, 0x1A, 0x95, 0xA7, 0xF4, 0x1A, 0x78,
- 0x1D, 0xA2, 0x40, 0x15, 0x41, 0x59, 0xDD, 0xEE, 0x23, 0x57,
- 0xCE, 0x36, 0x0D, 0x55, 0xBD, 0xB8, 0xFD, 0x0F, 0x35, 0xBD,
- 0x5B, 0x92, 0xD6, 0x1C, 0x84, 0x8C, 0x32, 0x64, 0xA6, 0x5C,
- 0x45, 0x18, 0x07, 0x6B, 0xF9, 0xA9, 0x43, 0x9A, 0x83, 0xCD,
- 0xB5, 0xB3, 0xD9, 0x17, 0x99, 0x2C, 0x2A, 0x8B, 0xE0, 0x8E,
- 0xAF, 0xA6, 0x4C, 0x95, 0xBB, 0x70, 0x60, 0x1A, 0x3A, 0x97,
- 0xAA, 0x2F, 0x3D, 0x22, 0x83, 0xB7, 0x4F, 0x59, 0xED, 0x3F,
- 0x4E, 0xF4, 0x19, 0xC6, 0x25, 0x0B, 0x0A, 0x5E, 0x21, 0xB9,
- 0x91, 0xB8, 0x19, 0x84, 0x48, 0x78, 0xCE, 0x27, 0xBF, 0x41,
- 0x89, 0xF6, 0x30, 0xFD, 0x6B, 0xD9, 0xB8, 0x1D, 0x72, 0x8A,
- 0x56, 0xCC, 0x2F, 0x82, 0xE4, 0x46, 0x4D, 0x75, 0xD8, 0x92,
- 0xE6, 0x9C, 0xCC, 0xD2, 0xCD, 0x35, 0xE4, 0xFC, 0x2A, 0x85,
- 0x6B, 0xA9, 0xB2, 0x27, 0xC9, 0xA1, 0xFF, 0xB3, 0x96, 0x3E,
- 0x59, 0xF6, 0x4C, 0x66, 0x56, 0x2E, 0xF5, 0x1B, 0x97, 0x32,
- 0xB0, 0x71, 0x5A, 0x9C, 0x50, 0x4B, 0x6F, 0xC4, 0xCA, 0x94,
- 0x75, 0x37, 0x46, 0x10, 0x12, 0x2F, 0x4F, 0xA3, 0x82, 0xCD,
- 0xBD, 0x7C,
+ var pk = [330]byte{
+ 0x6D, 0x8D, 0xF5, 0x7B, 0xCD, 0x47, 0xCA, 0xCB, 0x7A, 0x38,
+ 0xB7, 0xA6, 0x90, 0xB7, 0x37, 0x03, 0xD4, 0x6F, 0x27, 0x73,
+ 0x74, 0x17, 0x5A, 0xA4, 0x0D, 0xC6, 0x81, 0xAD, 0xDB, 0xF7,
+ 0x18, 0xB2, 0x3C, 0x30, 0xCF, 0xAA, 0x08, 0x11, 0x91, 0xCC,
+ 0x27, 0x4E, 0xF1, 0xA6, 0xB7, 0xDA, 0xD2, 0xCF, 0x99, 0x7F,
+ 0xF7, 0xE1, 0xD0, 0xCE, 0x00, 0xD2, 0x4B, 0xA4, 0x33, 0xB4,
+ 0x87, 0x01, 0x3F, 0x02, 0xF7, 0xF9, 0xDE, 0xC3, 0x60, 0x62,
+ 0xDA, 0x3F, 0x74, 0xA9, 0x44, 0xBE, 0x19, 0xD5, 0x03, 0x2A,
+ 0x79, 0x8C, 0xA7, 0xFF, 0xEA, 0xB3, 0xBB, 0xB5, 0xD4, 0x1D,
+ 0x8F, 0x92, 0xCE, 0x62, 0x6E, 0x99, 0x24, 0xD7, 0x57, 0xFA,
+ 0xCD, 0xB6, 0xE2, 0x8E, 0xFD, 0x22, 0x0E, 0x31, 0x21, 0x01,
+ 0x8D, 0x79, 0xF8, 0x3E, 0x27, 0xEC, 0x43, 0x40, 0xDB, 0x82,
+ 0xE5, 0xEB, 0x6C, 0x97, 0x66, 0x29, 0x15, 0x68, 0xB7, 0x4D,
+ 0x84, 0xD1, 0x8A, 0x0B, 0x12, 0x36, 0x2C, 0x0C, 0x0A, 0x6E,
+ 0x4E, 0xDE, 0xA5, 0x8A, 0xDE, 0x77, 0xDD, 0x70, 0x49, 0x73,
+ 0xAC, 0x27, 0x6D, 0x8D, 0x25, 0x9A, 0xE4, 0x25, 0xE8, 0x95,
+ 0x8F, 0xFE, 0x90, 0x3B, 0x00, 0x69, 0x20, 0xE8, 0x7C, 0xA5,
+ 0xF5, 0x79, 0xC0, 0x61, 0x51, 0x91, 0x35, 0x25, 0x3F, 0x17,
+ 0x2F, 0x70, 0x73, 0xF0, 0x89, 0xB5, 0xC8, 0x25, 0xB8, 0xE5,
+ 0x7E, 0x34, 0xDD, 0x11, 0xE5, 0xD6, 0xC3, 0xD5, 0x29, 0x89,
+ 0xC6, 0x2C, 0x99, 0x53, 0x1D, 0x2C, 0x77, 0xB0, 0xB6, 0xA1,
+ 0xBD, 0x79, 0xFB, 0x4A, 0xC2, 0x48, 0x4C, 0x62, 0x51, 0x00,
+ 0xE3, 0x91, 0x2A, 0xCB, 0x84, 0x03, 0x5D, 0x2D, 0xC8, 0x33,
+ 0xE9, 0x14, 0xBF, 0x74, 0x21, 0xBC, 0xF4, 0x76, 0xE5, 0x42,
+ 0xB8, 0xBD, 0xE2, 0xE7, 0x20, 0x95, 0x54, 0xF2, 0xED, 0xC0,
+ 0x79, 0x38, 0x1E, 0xD2, 0xEA, 0x1A, 0x63, 0x85, 0xE7, 0x3A,
+ 0xDA, 0xAD, 0xAB, 0x1B, 0x1E, 0x19, 0x9E, 0x73, 0xD0, 0x10,
+ 0x2E, 0x38, 0xAC, 0x8B, 0x00, 0x6A, 0x30, 0x2C, 0x3D, 0x70,
+ 0x8E, 0x39, 0x6D, 0xC0, 0x12, 0x61, 0x7D, 0x2A, 0x0A, 0x04,
+ 0x95, 0x8E, 0x09, 0x3C, 0x7B, 0xEC, 0x2E, 0xBC, 0xE8, 0xE8,
+ 0xE8, 0x37, 0x29, 0xC4, 0x7E, 0x76, 0x48, 0xB9, 0x3B, 0x72,
+ 0xE5, 0x99, 0x9B, 0xF9, 0xE3, 0x99, 0x72, 0x3F, 0x35, 0x29,
+ 0x85, 0xE0, 0xC8, 0xBF, 0xB1, 0x6B, 0xB1, 0x6E, 0x72, 0x00,
}
- var ss_exp = []byte{
- 0x74, 0x3D, 0x25, 0x36, 0x00, 0x24, 0x63, 0x1A, 0x39, 0x1A,
- 0xB4, 0xAD, 0x01, 0x17, 0x78, 0xE9}
+
+ var ct = [330 + 16]byte{
+ 0xFF, 0xEB, 0xEF, 0x4A, 0xC0, 0x57, 0x0F, 0x26, 0xAC, 0x76,
+ 0xA8, 0xB0, 0xA3, 0x5D, 0x9C, 0xD9, 0x25, 0xD1, 0x7F, 0x92,
+ 0x5D, 0xF4, 0x23, 0x34, 0xC3, 0x03, 0x10, 0xE1, 0xB0, 0x24,
+ 0x9B, 0x44, 0x58, 0x26, 0x13, 0x56, 0x83, 0x43, 0x72, 0x69,
+ 0x28, 0x0D, 0x55, 0x07, 0x1F, 0xDB, 0xC0, 0x23, 0x34, 0x83,
+ 0x1A, 0x09, 0x9B, 0x80, 0x00, 0x64, 0x56, 0xDC, 0x79, 0x7A,
+ 0xD2, 0xCE, 0x23, 0xC9, 0x72, 0x27, 0xFC, 0x8D, 0xAB, 0xBF,
+ 0xD3, 0x17, 0xF6, 0x91, 0x7B, 0x15, 0x93, 0x83, 0x8A, 0x4F,
+ 0x6C, 0xCA, 0x4A, 0x94, 0xDA, 0xC7, 0x9D, 0xB6, 0xD6, 0xBA,
+ 0xBD, 0x81, 0x9A, 0x78, 0xE5, 0xE5, 0xBE, 0x17, 0xBC, 0xCB,
+ 0xC8, 0x23, 0x80, 0x5F, 0x75, 0xF8, 0xDB, 0x51, 0x55, 0x00,
+ 0x25, 0x33, 0x52, 0x64, 0xB2, 0xD6, 0xD8, 0x9A, 0x2A, 0x9E,
+ 0x29, 0x99, 0x13, 0x33, 0xE2, 0xA7, 0x98, 0xAC, 0xD7, 0x79,
+ 0x5C, 0x2F, 0xBA, 0x07, 0xC3, 0x03, 0x37, 0xD6, 0xE6, 0xB5,
+ 0xA1, 0xF5, 0x29, 0xB6, 0xF6, 0xC0, 0x5C, 0x44, 0x68, 0x2B,
+ 0x0B, 0xF5, 0x00, 0x01, 0x44, 0xD5, 0xCC, 0x23, 0xB5, 0x27,
+ 0x4F, 0xCA, 0xB4, 0x05, 0x01, 0xF9, 0xD4, 0x41, 0xE0, 0xE1,
+ 0x1E, 0xCF, 0xA9, 0xBC, 0x79, 0xD7, 0xD5, 0xF5, 0x3C, 0xE6,
+ 0x93, 0xF4, 0x6C, 0x84, 0x5A, 0x2C, 0x4B, 0xE4, 0x91, 0xB2,
+ 0xB2, 0xB8, 0xAD, 0x74, 0x9A, 0x69, 0x79, 0x4C, 0x84, 0xB7,
+ 0xBF, 0xF1, 0x68, 0x4B, 0xAE, 0x0F, 0x7F, 0x45, 0x3B, 0x18,
+ 0x3F, 0xFA, 0x00, 0x48, 0xE0, 0x3A, 0xE2, 0xC0, 0xAE, 0x00,
+ 0xCE, 0x90, 0x28, 0xA4, 0x1B, 0xBE, 0xCA, 0x0C, 0x21, 0x29,
+ 0x64, 0x30, 0x5E, 0x35, 0xAD, 0xFD, 0x83, 0x47, 0x40, 0x6D,
+ 0x15, 0x56, 0xFC, 0xF8, 0x5F, 0xAB, 0x81, 0xFE, 0x6B, 0xE9,
+ 0x6B, 0xED, 0x27, 0x35, 0x7C, 0xD8, 0x2C, 0xD4, 0xF2, 0x11,
+ 0xE6, 0xAF, 0xDF, 0xB8, 0x91, 0x96, 0xEB, 0xF7, 0x4C, 0x8D,
+ 0x70, 0x77, 0x90, 0x81, 0x00, 0x09, 0x19, 0x27, 0x8A, 0x9E,
+ 0xB6, 0x1A, 0xE9, 0xAC, 0x6C, 0xC9, 0xF8, 0xEA, 0xA2, 0x34,
+ 0xB8, 0xAC, 0xB3, 0xB3, 0x68, 0xA1, 0xB7, 0x29, 0x55, 0xCA,
+ 0x40, 0x23, 0x92, 0x5C, 0x0C, 0x79, 0x6B, 0xD6, 0x9F, 0x5B,
+ 0xD2, 0xE6, 0xAE, 0x04, 0xCB, 0xEC, 0xC7, 0x88, 0x18, 0xDB,
+ 0x7A, 0xE6, 0xD6, 0xC9, 0x39, 0xFD, 0x93, 0x9B, 0xC8, 0x01,
+ 0x6F, 0x3E, 0x6C, 0x90, 0x3E, 0x73, 0x76, 0x99, 0x7C, 0x48,
+ 0xDA, 0x68, 0x48, 0x80, 0x2B, 0x63,
+ }
+ var ssExp = [16]byte{
+ 0xA1, 0xF9, 0x5A, 0x67, 0xB9, 0x3D, 0x1E, 0x72, 0xE8, 0xC5,
+ 0x71, 0xF1, 0x4C, 0xB2, 0xAA, 0x6D,
+ }
var prvObj = NewPrivateKey(KeyVariant_SIKE)
var pubObj = NewPublicKey(KeyVariant_SIKE)
- if pubObj.Import(pk) != nil || prvObj.Import(sk[:]) != nil {
+ if pubObj.Import(pk[:]) != nil || prvObj.Import(sk[:]) != nil {
t.Error("Can't import one of the keys")
}
- res, _ := Decapsulate(prvObj, pubObj, ct)
- if !bytes.Equal(ss_exp, res) {
+ res, _ := Decapsulate(prvObj, pubObj, ct[:])
+ if !bytes.Equal(ssExp[:], res) {
t.Error("Wrong decapsulation result")
}
}
@@ -629,10 +636,10 @@
Benchmarking
-------------------------------------------------------------------------*/
-func BenchmarkSidhKeyAgreementP503(b *testing.B) {
+func BenchmarkSidhKeyAgreement(b *testing.B) {
// KeyPairs
- alicePublic := convToPub(tdata.PkA_sike, KeyVariant_SIDH_A)
- alicePrivate := convToPrv(tdata.PrA_sike, KeyVariant_SIDH_A)
+ alicePublic := convToPub(tdata.PkA_sidh, KeyVariant_SIDH_A)
+ alicePrivate := convToPrv(tdata.PrA_sidh, KeyVariant_SIDH_A)
bobPublic := convToPub(tdata.PkB_sidh, KeyVariant_SIDH_B)
bobPrivate := convToPrv(tdata.PrB_sidh, KeyVariant_SIDH_B)
@@ -643,21 +650,21 @@
}
}
-func BenchmarkAliceKeyGenPrvP503(b *testing.B) {
+func BenchmarkAliceKeyGenPrv(b *testing.B) {
prv := NewPrivateKey(KeyVariant_SIDH_A)
for n := 0; n < b.N; n++ {
prv.Generate(rand.Reader)
}
}
-func BenchmarkBobKeyGenPrvP503(b *testing.B) {
+func BenchmarkBobKeyGenPrv(b *testing.B) {
prv := NewPrivateKey(KeyVariant_SIDH_B)
for n := 0; n < b.N; n++ {
prv.Generate(rand.Reader)
}
}
-func BenchmarkAliceKeyGenPubP503(b *testing.B) {
+func BenchmarkAliceKeyGenPub(b *testing.B) {
prv := NewPrivateKey(KeyVariant_SIDH_A)
prv.Generate(rand.Reader)
for n := 0; n < b.N; n++ {
@@ -665,7 +672,7 @@
}
}
-func BenchmarkBobKeyGenPubP503(b *testing.B) {
+func BenchmarkBobKeyGenPub(b *testing.B) {
prv := NewPrivateKey(KeyVariant_SIDH_B)
prv.Generate(rand.Reader)
for n := 0; n < b.N; n++ {
@@ -673,17 +680,17 @@
}
}
-func BenchmarkSharedSecretAliceP503(b *testing.B) {
- aPr := convToPrv(tdata.PrA_sike, KeyVariant_SIDH_A)
+func BenchmarkSharedSecretAlice(b *testing.B) {
+ aPr := convToPrv(tdata.PrA_sidh, KeyVariant_SIDH_A)
bPk := convToPub(tdata.PkB_sike, KeyVariant_SIDH_B)
for n := 0; n < b.N; n++ {
DeriveSecret(aPr, bPk)
}
}
-func BenchmarkSharedSecretBobP503(b *testing.B) {
+func BenchmarkSharedSecretBob(b *testing.B) {
// m_B = 3*randint(0,3^238)
- aPk := convToPub(tdata.PkA_sike, KeyVariant_SIDH_A)
+ aPk := convToPub(tdata.PkA_sidh, KeyVariant_SIDH_A)
bPr := convToPrv(tdata.PrB_sidh, KeyVariant_SIDH_B)
for n := 0; n < b.N; n++ {
DeriveSecret(bPr, aPk)
diff --git a/third_party/sike/P503.c b/third_party/sike/P503.c
deleted file mode 100644
index b8463e7..0000000
--- a/third_party/sike/P503.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/********************************************************************************************
-* SIDH: an efficient supersingular isogeny cryptography library
-*
-* Abstract: supersingular isogeny parameters and generation of functions for P503
-*********************************************************************************************/
-
-#include "utils.h"
-
-// Parameters for isogeny system "SIKEp503"
-const struct params_t p503 = {
- .prime = {
- U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF),
- U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xABFFFFFFFFFFFFFF),
- U64_TO_WORDS(0x13085BDA2211E7A0), U64_TO_WORDS(0x1B9BF6C87B7E7DAF),
- U64_TO_WORDS(0x6045C6BDDA77A4D0), U64_TO_WORDS(0x004066F541811E1E)
- },
- .prime_p1 = {
- U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
- U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xAC00000000000000),
- U64_TO_WORDS(0x13085BDA2211E7A0), U64_TO_WORDS(0x1B9BF6C87B7E7DAF),
- U64_TO_WORDS(0x6045C6BDDA77A4D0), U64_TO_WORDS(0x004066F541811E1E)
- },
- .prime_x2 = {
- U64_TO_WORDS(0xFFFFFFFFFFFFFFFE), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF),
- U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0x57FFFFFFFFFFFFFF),
- U64_TO_WORDS(0x2610B7B44423CF41), U64_TO_WORDS(0x3737ED90F6FCFB5E),
- U64_TO_WORDS(0xC08B8D7BB4EF49A0), U64_TO_WORDS(0x0080CDEA83023C3C)
- },
- .A_gen = {
- U64_TO_WORDS(0xE7EF4AA786D855AF), U64_TO_WORDS(0xED5758F03EB34D3B),
- U64_TO_WORDS(0x09AE172535A86AA9), U64_TO_WORDS(0x237B9CC07D622723),
- U64_TO_WORDS(0xE3A284CBA4E7932D), U64_TO_WORDS(0x27481D9176C5E63F),
- U64_TO_WORDS(0x6A323FF55C6E71BF), U64_TO_WORDS(0x002ECC31A6FB8773), // XPA0
- U64_TO_WORDS(0x64D02E4E90A620B8), U64_TO_WORDS(0xDAB8128537D4B9F1),
- U64_TO_WORDS(0x4BADF77B8A228F98), U64_TO_WORDS(0x0F5DBDF9D1FB7D1B),
- U64_TO_WORDS(0xBEC4DB288E1A0DCC), U64_TO_WORDS(0xE76A8665E80675DB),
- U64_TO_WORDS(0x6D6F252E12929463), U64_TO_WORDS(0x003188BD1463FACC), // XPA1
- U64_TO_WORDS(0xB79D41025DE85D56), U64_TO_WORDS(0x0B867DA9DF169686),
- U64_TO_WORDS(0x740E5368021C827D), U64_TO_WORDS(0x20615D72157BF25C),
- U64_TO_WORDS(0xFF1590013C9B9F5B), U64_TO_WORDS(0xC884DCADE8C16CEA),
- U64_TO_WORDS(0xEBD05E53BF724E01), U64_TO_WORDS(0x0032FEF8FDA5748C), // XQA0
- U64_TO_WORDS(0x12E2E849AA0A8006), U64_TO_WORDS(0x41CF47008635A1E8),
- U64_TO_WORDS(0x9CD720A70798AED7), U64_TO_WORDS(0x42A820B42FCF04CF),
- U64_TO_WORDS(0x7BF9BAD32AAE88B1), U64_TO_WORDS(0xF619127A54090BBE),
- U64_TO_WORDS(0x1CB10D8F56408EAA), U64_TO_WORDS(0x001D6B54C3C0EDEB), // XRA0
- U64_TO_WORDS(0x34DB54931CBAAC36), U64_TO_WORDS(0x420A18CB8DD5F0C4),
- U64_TO_WORDS(0x32008C1A48C0F44D), U64_TO_WORDS(0x3B3BA772B1CFD44D),
- U64_TO_WORDS(0xA74B058FDAF13515), U64_TO_WORDS(0x095FC9CA7EEC17B4),
- U64_TO_WORDS(0x448E829D28F120F8), U64_TO_WORDS(0x00261EC3ED16A489) // XRA1
- },
- .B_gen = {
- U64_TO_WORDS(0x7EDE37F4FA0BC727), U64_TO_WORDS(0xF7F8EC5C8598941C),
- U64_TO_WORDS(0xD15519B516B5F5C8), U64_TO_WORDS(0xF6D5AC9B87A36282),
- U64_TO_WORDS(0x7B19F105B30E952E), U64_TO_WORDS(0x13BD8B2025B4EBEE),
- U64_TO_WORDS(0x7B96D27F4EC579A2), U64_TO_WORDS(0x00140850CAB7E5DE), // XPB0
- U64_TO_WORDS(0x7764909DAE7B7B2D), U64_TO_WORDS(0x578ABB16284911AB),
- U64_TO_WORDS(0x76E2BFD146A6BF4D), U64_TO_WORDS(0x4824044B23AA02F0),
- U64_TO_WORDS(0x1105048912A321F3), U64_TO_WORDS(0xB8A2E482CF0F10C1),
- U64_TO_WORDS(0x42FF7D0BE2152085), U64_TO_WORDS(0x0018E599C5223352), // XPB1
- U64_TO_WORDS(0x4256C520FB388820), U64_TO_WORDS(0x744FD7C3BAAF0A13),
- U64_TO_WORDS(0x4B6A2DDDB12CBCB8), U64_TO_WORDS(0xE46826E27F427DF8),
- U64_TO_WORDS(0xFE4A663CD505A61B), U64_TO_WORDS(0xD6B3A1BAF025C695),
- U64_TO_WORDS(0x7C3BB62B8FCC00BD), U64_TO_WORDS(0x003AFDDE4A35746C), // XQB0
- U64_TO_WORDS(0x75601CD1E6C0DFCB), U64_TO_WORDS(0x1A9007239B58F93E),
- U64_TO_WORDS(0xC1F1BE80C62107AC), U64_TO_WORDS(0x7F513B898F29FF08),
- U64_TO_WORDS(0xEA0BEDFF43E1F7B2), U64_TO_WORDS(0x2C6D94018CBAE6D0),
- U64_TO_WORDS(0x3A430D31BCD84672), U64_TO_WORDS(0x000D26892ECCFE83), // XRB0
- U64_TO_WORDS(0x1119D62AEA3007A1), U64_TO_WORDS(0xE3702AA4E04BAE1B),
- U64_TO_WORDS(0x9AB96F7D59F990E7), U64_TO_WORDS(0xF58440E8B43319C0),
- U64_TO_WORDS(0xAF8134BEE1489775), U64_TO_WORDS(0xE7F7774E905192AA),
- U64_TO_WORDS(0xF54AE09308E98039), U64_TO_WORDS(0x001EF7A041A86112) // XRB1
- },
- .mont_R2 = {
- U64_TO_WORDS(0x5289A0CF641D011F), U64_TO_WORDS(0x9B88257189FED2B9),
- U64_TO_WORDS(0xA3B365D58DC8F17A), U64_TO_WORDS(0x5BC57AB6EFF168EC),
- U64_TO_WORDS(0x9E51998BD84D4423), U64_TO_WORDS(0xBF8999CBAC3B5695),
- U64_TO_WORDS(0x46E9127BCE14CDB6), U64_TO_WORDS(0x003F6CFCE8B81771)
- },
- .mont_one = {
- U64_TO_WORDS(0x00000000000003F9), U64_TO_WORDS(0x0000000000000000),
- U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xB400000000000000),
- U64_TO_WORDS(0x63CB1A6EA6DED2B4), U64_TO_WORDS(0x51689D8D667EB37D),
- U64_TO_WORDS(0x8ACD77C71AB24142), U64_TO_WORDS(0x0026FBAEC60F5953)
- },
- .A_strat = {
- 61, 32, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1,
- 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1,
- 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 29, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1,
- 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 13, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2,
- 1, 1, 2, 1, 1, 5, 4, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1
- },
- .B_strat = {
- 71, 38, 21, 13, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 5, 4, 2, 1, 1, 2, 1,
- 1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 17, 9,
- 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1,
- 1, 4, 2, 1, 1, 2, 1, 1, 33, 17, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1,
- 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 1, 2,
- 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1
- }
-};
diff --git a/third_party/sike/asm/fp-armv8.pl b/third_party/sike/asm/fp-armv8.pl
index a1728d1..ce19d80 100644
--- a/third_party/sike/asm/fp-armv8.pl
+++ b/third_party/sike/asm/fp-armv8.pl
@@ -2,7 +2,7 @@
#
# April 2019
#
-# Abstract: field arithmetic in aarch64 assembly for SIDH/p503
+# Abstract: field arithmetic in aarch64 assembly for SIDH/p434
$flavour = shift;
$output = shift;
@@ -21,21 +21,23 @@
$code.=<<___;
.section .rodata
-.Lp503p1_nz_s8:
- .quad 0x085BDA2211E7A0AC, 0x9BF6C87B7E7DAF13
- .quad 0x45C6BDDA77A4D01B, 0x4066F541811E1E60
-
-.Lp503x2:
+# p434 x 2
+.Lp434x2:
.quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF
- .quad 0x57FFFFFFFFFFFFFF, 0x2610B7B44423CF41
- .quad 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0
- .quad 0x0080CDEA83023C3C
+ .quad 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47
+ .quad 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688
+
+# p434 + 1
+.Lp434p1:
+ .quad 0xFDC1767AE3000000, 0x7BC65C783158AEA3
+ .quad 0x6CFC5FD681C52056, 0x0002341F27177344
.text
___
-# C[0-2] = A[0] * B[0-1]
-sub mul64x128_comba_cut {
+# Computes C0-C2 = A0 * (B0-B1)
+# Inputs remain intact
+sub mul64x128 {
my ($A0,$B0,$B1,$C0,$C1,$C2,$T0,$T1)=@_;
my $body=<<___;
mul $T1, $A0, $B0
@@ -55,14 +57,161 @@
return $body;
}
-sub mul256_karatsuba_comba {
+# Computes C0-C4 = A0 * (B0-B3)
+# Inputs remain intact
+sub mul64x256 {
+ my ($A0,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$T0,$T1,$T2)=@_;
+ my $body=<<___;
+ mul $C0, $A0, $B0 // C0
+ umulh $T0, $A0, $B0
+
+ mul $C1, $A0, $B1
+ umulh $T1, $A0, $B1
+ adds $C1, $C1, $T0 // C1
+ adc $T0, xzr, xzr
+
+ mul $C2, $A0, $B2
+ umulh $T2, $A0, $B2
+ adds $T1, $T0, $T1
+ adcs $C2, $C2, $T1 // C2
+ adc $T0, xzr, xzr
+
+ mul $C3, $A0, $B3
+ umulh $C4, $A0, $B3
+ adds $T2, $T0, $T2
+ adcs $C3, $C3, $T2 // C3
+ adc $C4, $C4, xzr // C4
+___
+ return $body;
+}
+
+# Computes C0-C5 = (A0-A1) * (B0-B3)
+# Inputs remain intact
+sub mul128x256 {
+ my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
+ my $body=<<___;
+ mul $C0, $A0, $B0 // C0
+ umulh $C3, $A0, $B0
+
+ mul $C1, $A0, $B1
+ umulh $C2, $A0, $B1
+
+ mul $T0, $A1, $B0
+ umulh $T1, $A1, $B0
+ adds $C1, $C1, $C3
+ adc $C2, $C2, xzr
+
+ mul $T2, $A0, $B2
+ umulh $T3, $A0, $B2
+ adds $C1, $C1, $T0 // C1
+ adcs $C2, $C2, $T1
+ adc $C3, xzr, xzr
+
+ mul $T0, $A1, $B1
+ umulh $T1, $A1, $B1
+ adds $C2, $C2, $T2
+ adcs $C3, $C3, $T3
+ adc $C4, xzr, xzr
+
+ mul $T2, $A0, $B3
+ umulh $T3, $A0, $B3
+ adds $C2, $C2, $T0 // C2
+ adcs $C3, $C3, $T1
+ adc $C4, $C4, xzr
+
+ mul $T0, $A1, $B2
+ umulh $T1, $A1, $B2
+ adds $C3, $C3, $T2
+ adcs $C4, $C4, $T3
+ adc $C5, xzr, xzr
+
+ mul $T2, $A1, $B3
+ umulh $T3, $A1, $B3
+ adds $C3, $C3, $T0 // C3
+ adcs $C4, $C4, $T1
+ adc $C5, $C5, xzr
+ adds $C4, $C4, $T2 // C4
+ adc $C5, $C5, $T3 // C5
+
+___
+ return $body;
+}
+
+# Computes C0-C5 = (A0-A2) * (B0-B2)
+# Inputs remain intact
+sub mul192 {
+ my ($A0,$A1,$A2,$B0,$B1,$B2,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
+ my $body=<<___;
+
+ // A0 * B0
+ mul $C0, $A0, $B0 // C0
+ umulh $C3, $A0, $B0
+
+ // A0 * B1
+ mul $C1, $A0, $B1
+ umulh $C2, $A0, $B1
+
+ // A1 * B0
+ mul $T0, $A1, $B0
+ umulh $T1, $A1, $B0
+ adds $C1, $C1, $C3
+ adc $C2, $C2, xzr
+
+ // A0 * B2
+ mul $T2, $A0, $B2
+ umulh $T3, $A0, $B2
+ adds $C1, $C1, $T0 // C1
+ adcs $C2, $C2, $T1
+ adc $C3, xzr, xzr
+
+ // A2 * B0
+ mul $T0, $A2, $B0
+ umulh $C4, $A2, $B0
+ adds $C2, $C2, $T2
+ adcs $C3, $C3, $C4
+ adc $C4, xzr, xzr
+
+ // A1 * B1
+ mul $T2, $A1, $B1
+ umulh $T1, $A1, $B1
+ adds $C2, $C2, $T0
+ adcs $C3, $C3, $T3
+ adc $C4, $C4, xzr
+
+ // A1 * B2
+ mul $T0, $A1, $B2
+ umulh $T3, $A1, $B2
+ adds $C2, $C2, $T2 // C2
+ adcs $C3, $C3, $T1
+ adc $C4, $C4, xzr
+
+ // A2 * B1
+ mul $T2, $A2, $B1
+ umulh $T1, $A2, $B1
+ adds $C3, $C3, $T0
+ adcs $C4, $C4, $T3
+ adc $C5, xzr, xzr
+
+ // A2 * B2
+ mul $T0, $A2, $B2
+ umulh $T3, $A2, $B2
+ adds $C3, $C3, $T2 // C3
+ adcs $C4, $C4, $T1
+ adc $C5, $C5, xzr
+
+ adds $C4, $C4, $T0 // C4
+ adc $C5, $C5, $T3 // C5
+___
+ return $body;
+}
+sub mul256_karatsuba {
my ($M,$A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7,$T0,$T1)=@_;
# (AH+AL) x (BH+BL), low part
- my $mul_low=&mul64x128_comba_cut($A1, $C6, $T1, $C3, $C4, $C5, $C7, $A0);
+ my $mul_low=&mul64x128($A1, $C6, $T1, $C3, $C4, $C5, $C7, $A0);
# AL x BL
- my $mul_albl=&mul64x128_comba_cut($A1, $B0, $B1, $C1, $T1, $C7, $C6, $A0);
+ my $mul_albl=&mul64x128($A1, $B0, $B1, $C1, $T1, $C7, $C6, $A0);
# AH x BH
- my $mul_ahbh=&mul64x128_comba_cut($A3, $B2, $B3, $A1, $C6, $B0, $B1, $A2);
+ my $mul_ahbh=&mul64x128($A3, $B2, $B3, $A1, $C6, $B0, $B1, $A2);
my $body=<<___;
// A0-A1 <- AH + AL, T0 <- mask
adds $A0, $A0, $A2
@@ -146,26 +295,25 @@
# Operation: c [x2] = a [x0] * b [x1]
sub mul {
# (AH+AL) x (BH+BL), low part
- my $mul_kc_low=&mul256_karatsuba_comba(
+ my $mul_kc_low=&mul256_karatsuba(
"x2", # M0
"x3","x4","x5","x6", # A0-A3
- "x11","x12","x13","x14", # B0-B3
- "x8","x9","x10","x20","x21","x22","x23","x24", # C0-C7
+ "x10","x11","x12","x13", # B0-B3
+ "x8","x9","x19","x20","x21","x22","x23","x24", # C0-C7
"x25","x26"); # TMP
# AL x BL
- my $mul_albl=&mul256_karatsuba_comba(
- "x0", # M0
+ my $mul_albl=&mul256_karatsuba(
+ "x0", # M0
"x3","x4","x5","x6", # A0-A3
- "x11","x12","x13","x14", # B0-B3
+ "x10","x11","x12","x13", # B0-B3
"x21","x22","x23","x24","x25","x26","x27","x28",# C0-C7
"x8","x9"); # TMP
# AH x BH
- my $mul_ahbh=&mul256_karatsuba_comba(
- "x0", # M0
- "x3","x4","x5","x6", # A0-A3
- "x11","x12","x13","x14", # B0-B3
- "x21","x22","x23","x24","x25","x26","x27","x28",# C0-C7
- "x8","x9"); # TMP
+ my $mul_ahbh=&mul192(
+ "x3","x4","x5", # A0-A2
+ "x10","x11","x12", # B0-B2
+ "x21","x22","x23","x24","x25","x26", # C0-C5
+ "x8","x9","x27","x28"); # TMP
my $body=<<___;
.global ${PREFIX}_mpmul
@@ -179,27 +327,27 @@
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
- ldp x3, x4, [x0]
- ldp x5, x6, [x0,#16]
- ldp x7, x8, [x0,#32]
- ldp x9, x10, [x0,#48]
- ldp x11, x12, [x1,#0]
- ldp x13, x14, [x1,#16]
- ldp x15, x16, [x1,#32]
- ldp x17, x19, [x1,#48]
+ ldp x3, x4, [x0]
+ ldp x5, x6, [x0,#16]
+ ldp x7, x8, [x0,#32]
+ ldr x9, [x0,#48]
+ ldp x10, x11, [x1,#0]
+ ldp x12, x13, [x1,#16]
+ ldp x14, x15, [x1,#32]
+ ldr x16, [x1,#48]
// x3-x7 <- AH + AL, x7 <- carry
adds x3, x3, x7
adcs x4, x4, x8
adcs x5, x5, x9
- adcs x6, x6, x10
+ adcs x6, x6, xzr
adc x7, xzr, xzr
- // x11-x14 <- BH + BL, x8 <- carry
- adds x11, x11, x15
+ // x10-x13 <- BH + BL, x8 <- carry
+ adds x10, x10, x14
+ adcs x11, x11, x15
adcs x12, x12, x16
- adcs x13, x13, x17
- adcs x14, x14, x19
+ adcs x13, x13, xzr
adc x8, xzr, xzr
// x9 <- combined carry
@@ -208,12 +356,11 @@
sub x7, xzr, x7
sub x8, xzr, x8
-
// x15-x19 <- masked (BH + BL)
+ and x14, x10, x7
and x15, x11, x7
and x16, x12, x7
and x17, x13, x7
- and x19, x14, x7
// x20-x23 <- masked (AH + AL)
and x20, x3, x8
@@ -222,46 +369,46 @@
and x23, x6, x8
// x15-x19, x7 <- masked (AH+AL) + masked (BH+BL), step 1
- adds x15, x15, x20
- adcs x16, x16, x21
- adcs x17, x17, x22
- adcs x19, x19, x23
+ adds x14, x14, x20
+ adcs x15, x15, x21
+ adcs x16, x16, x22
+ adcs x17, x17, x23
adc x7, x9, xzr
- // x8-x10,x20-x24 <- (AH+AL) x (BH+BL), low part
+ // x8-x9,x19,x20-x24 <- (AH+AL) x (BH+BL), low part
stp x3, x4, [x2,#0]
$mul_kc_low
// x15-x19, x7 <- (AH+AL) x (BH+BL), final step
- adds x15, x15, x21
- adcs x16, x16, x22
- adcs x17, x17, x23
- adcs x19, x19, x24
+ adds x14, x14, x21
+ adcs x15, x15, x22
+ adcs x16, x16, x23
+ adcs x17, x17, x24
adc x7, x7, xzr
// Load AL
ldp x3, x4, [x0]
ldp x5, x6, [x0,#16]
// Load BL
- ldp x11, x12, [x1,#0]
- ldp x13, x14, [x1,#16]
+ ldp x10, x11, [x1,#0]
+ ldp x12, x13, [x1,#16]
- // Temporarily store x8,x9 in x2
- stp x8,x9, [x2,#0]
+ // Temporarily store x8,x9 in x2
+ stp x8, x9, [x2,#0]
// x21-x28 <- AL x BL
$mul_albl
- // Restore x8,x9
- ldp x8,x9, [x2,#0]
+ // Restore x8,x9
+ ldp x8, x9, [x2,#0]
// x8-x10,x20,x15-x17,x19 <- maskd (AH+AL) x (BH+BL) - ALxBL
subs x8, x8, x21
sbcs x9, x9, x22
- sbcs x10, x10, x23
+ sbcs x19, x19, x23
sbcs x20, x20, x24
- sbcs x15, x15, x25
- sbcs x16, x16, x26
- sbcs x17, x17, x27
- sbcs x19, x19, x28
+ sbcs x14, x14, x25
+ sbcs x15, x15, x26
+ sbcs x16, x16, x27
+ sbcs x17, x17, x28
sbc x7, x7, xzr
// Store ALxBL, low
@@ -270,14 +417,14 @@
// Load AH
ldp x3, x4, [x0,#32]
- ldp x5, x6, [x0,#48]
+ ldr x5, [x0,#48]
// Load BH
- ldp x11, x12, [x1,#32]
- ldp x13, x14, [x1,#48]
+ ldp x10, x11, [x1,#32]
+ ldr x12, [x1,#48]
- adds x8, x8, x25
- adcs x9, x9, x26
- adcs x10, x10, x27
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adcs x19, x19, x27
adcs x20, x20, x28
adc x1, xzr, xzr
@@ -291,35 +438,32 @@
neg x1, x1
- // x8-x10,x20,x15-x17,x19 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
+ // x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
subs x8, x8, x21
sbcs x9, x9, x22
- sbcs x10, x10, x23
+ sbcs x19, x19, x23
sbcs x20, x20, x24
- sbcs x15, x15, x25
- sbcs x16, x16, x26
- sbcs x17, x17, x27
- sbcs x19, x19, x28
+ sbcs x14, x14, x25
+ sbcs x15, x15, x26
+ sbcs x16, x16, xzr
+ sbcs x17, x17, xzr
sbc x7, x7, xzr
// Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low
- stp x8, x9, [x2,#32]
- stp x10, x20, [x2,#48]
+ stp x8, x9, [x2,#32]
+ stp x19, x20, [x2,#48]
- adds x1, x1, #1
- adcs x15, x15, x21
- adcs x16, x16, x22
- adcs x17, x17, x23
- adcs x19, x19, x24
- adcs x25, x7, x25
- adcs x26, x26, xzr
- adcs x27, x27, xzr
- adc x28, x28, xzr
+ adds x1, x1, #1
+ adcs x14, x14, x21
+ adcs x15, x15, x22
+ adcs x16, x16, x23
+ adcs x17, x17, x24
+ adcs x25, x7, x25
+ adc x26, x26, xzr
- stp x15, x16, [x2,#64]
- stp x17, x19, [x2,#80]
+ stp x14, x15, [x2,#64]
+ stp x16, x17, [x2,#80]
stp x25, x26, [x2,#96]
- stp x27, x28, [x2,#112]
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
@@ -333,250 +477,120 @@
}
$code.=&mul();
-# Computes C0-C4 = (A0-A1) * (B0-B3)
-# Inputs remain intact
-sub mul128x256_comba {
- my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$T0,$T1,$T2,$T3)=@_;
- my $body=<<___;
- mul $T0, $A1, $B0
- umulh $T1, $A1, $B0
- adds $C0, $C0, $C2
- adc $C1, $C1, xzr
-
- mul $T2, $A0, $B2
- umulh $T3, $A0, $B2
- adds $C0, $C0, $T0
- adcs $C1, $C1, $T1
- adc $C2, xzr, xzr
-
- mul $T0, $A1, $B1
- umulh $T1, $A1, $B1
- adds $C1, $C1, $T2
- adcs $C2, $C2, $T3
- adc $C3, xzr, xzr
-
- mul $T2, $A0, $B3
- umulh $T3, $A0, $B3
- adds $C1, $C1, $T0
- adcs $C2, $C2, $T1
- adc $C3, $C3, xzr
-
- mul $T0, $A1, $B2
- umulh $T1, $A1, $B2
- adds $C2, $C2, $T2
- adcs $C3, $C3, $T3
- adc $C4, xzr, xzr
-
- mul $T2, $A1, $B3
- umulh $T3, $A1, $B3
- adds $C2, $C2, $T0
- adcs $C3, $C3, $T1
- adc $C4, $C4, xzr
- adds $C3, $C3, $T2
- adc $C4, $C4, $T3
-
-___
- return $body;
-}
-
# Montgomery reduction
# Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
# Operation: mc [x1] = ma [x0]
# NOTE: ma=mc is not allowed
sub rdc {
- my $mul01=&mul128x256_comba(
- "x2","x3", # A0-A1
- "x24","x25","x26","x27", # B0-B3
- "x5","x6","x7","x8","x9", # C0-B4
- "x1","x10","x11","x19"); # TMP
- my $mul23=&mul128x256_comba(
- "x2","x3", # A0-A1
- "x24","x25","x26","x27", # B0-B3
- "x5","x6","x7","x8","x9", # C0-C4
- "x1","x10","x11","x19"); # TMP
- my $mul45=&mul128x256_comba(
- "x12","x13", # A0-A1
- "x24","x25","x26","x27", # B0-B3
- "x5","x6","x7","x8","x9", # C0-C4
- "x1","x10","x11","x19"); # TMP
- my $mul67=&mul128x256_comba(
- "x14","x15", # A0-A1
- "x24","x25","x26","x27", # B0-B3
- "x5","x6","x7","x8","x9", # C0-C4
- "x1","x10","x11","x19"); # TMP
+ my $mul01=&mul128x256(
+ "x2","x3", # A0-A1
+ "x23","x24","x25","x26", # B0-B3
+ "x4","x5","x6","x7","x8","x9", # C0-C5
+ "x10","x11","x27","x28"); # TMP
+ my $mul23=&mul128x256(
+ "x2","x10", # A0-A1
+ "x23","x24","x25","x26", # B0-B3
+ "x4","x5","x6","x7","x8","x9", # C0-C5
+ "x0","x3","x27","x28"); # TMP
+ my $mul45=&mul128x256(
+ "x11","x12", # A0-A1
+ "x23","x24","x25","x26", # B0-B3
+ "x4","x5","x6","x7","x8","x9", # C0-C5
+ "x10","x3","x27","x28"); # TMP
+ my $mul67=&mul64x256(
+ "x13", # A0
+ "x23","x24","x25","x26", # B0-B3
+ "x4","x5","x6","x7","x8", # C0-C4
+ "x10","x27","x28"); # TMP
my $body=<<___;
.global ${PREFIX}_fprdc
.align 4
${PREFIX}_fprdc:
- stp x29, x30, [sp, #-112]!
- add x29, sp, #0
+ stp x29, x30, [sp, #-96]!
+ add x29, sp, xzr
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
- str x1, [sp,#96]
ldp x2, x3, [x0,#0] // a[0-1]
// Load the prime constant
- adrp x23, :pg_hi21:.Lp503p1_nz_s8
- add x23, x23, :lo12:.Lp503p1_nz_s8
- ldp x24, x25, [x23, #0]
- ldp x26, x27, [x23, #16]
+ adrp x26, :pg_hi21:.Lp434p1
+ add x26, x26, :lo12:.Lp434p1
+ ldp x23, x24, [x26, #0x0]
+ ldp x25, x26, [x26,#0x10]
- // a[0-1] x .Lp503p1_nz_s8 --> result: x4:x9
- mul x4, x2, x24 // a[0] x .Lp503p1_nz_s8[0]
- umulh x7, x2, x24
- mul x5, x2, x25 // a[0] x .Lp503p1_nz_s8[1]
- umulh x6, x2, x25
-
+ // a[0-1] * p434+1
$mul01
- ldp x2, x3, [x0,#16] // a[2]
- ldp x12, x13, [x0,#32]
- ldp x14, x15, [x0,#48]
+ ldp x10, x11, [x0, #0x18]
+ ldp x12, x13, [x0, #0x28]
+ ldp x14, x15, [x0, #0x38]
+ ldp x16, x17, [x0, #0x48]
+ ldp x19, x20, [x0, #0x58]
+ ldr x21, [x0, #0x68]
- orr x10, xzr, x9, lsr #8
- lsl x9, x9, #56
- orr x9, x9, x8, lsr #8
- lsl x8, x8, #56
- orr x8, x8, x7, lsr #8
- lsl x7, x7, #56
- orr x7, x7, x6, lsr #8
- lsl x6, x6, #56
- orr x6, x6, x5, lsr #8
- lsl x5, x5, #56
- orr x5, x5, x4, lsr #8
- lsl x4, x4, #56
+ adds x10, x10, x4
+ adcs x11, x11, x5
+ adcs x12, x12, x6
+ adcs x13, x13, x7
+ adcs x14, x14, x8
+ adcs x15, x15, x9
+ adcs x22, x16, xzr
+ adcs x17, x17, xzr
+ adcs x19, x19, xzr
+ adcs x20, x20, xzr
+ adc x21, x21, xzr
- adds x3, x4, x3 // a[3]
- adcs x12, x5, x12 // a[4]
- adcs x13, x6, x13
- adcs x14, x7, x14
- adcs x15, x8, x15
- ldp x16, x17, [x0,#64]
- ldp x28, x30, [x0,#80]
- mul x4, x2, x24 // a[2] x .Lp503p1_nz_s8[0]
- umulh x7, x2, x24
- adcs x16, x9, x16
- adcs x17, x10, x17
- adcs x28, xzr, x28
- adcs x30, xzr, x30
- ldp x20, x21, [x0,#96]
- ldp x22, x23, [x0,#112]
- mul x5, x2, x25 // a[2] x .Lp503p1_nz_s8[1]
- umulh x6, x2, x25
- adcs x20, xzr, x20
- adcs x21, xzr, x21
- adcs x22, xzr, x22
- adc x23, xzr, x23
-
- // a[2-3] x .Lp503p1_nz_s8 --> result: x4:x9
+ ldr x2, [x0,#0x10] // a[2]
+ // a[2-3] * p434+1
$mul23
- orr x10, xzr, x9, lsr #8
- lsl x9, x9, #56
- orr x9, x9, x8, lsr #8
- lsl x8, x8, #56
- orr x8, x8, x7, lsr #8
- lsl x7, x7, #56
- orr x7, x7, x6, lsr #8
- lsl x6, x6, #56
- orr x6, x6, x5, lsr #8
- lsl x5, x5, #56
- orr x5, x5, x4, lsr #8
- lsl x4, x4, #56
+ adds x12, x12, x4
+ adcs x13, x13, x5
+ adcs x14, x14, x6
+ adcs x15, x15, x7
+ adcs x16, x22, x8
+ adcs x17, x17, x9
+ adcs x22, x19, xzr
+ adcs x20, x20, xzr
+ adc x21, x21, xzr
- adds x13, x4, x13 // a[5]
- adcs x14, x5, x14 // a[6]
- adcs x15, x6, x15
- adcs x16, x7, x16
- mul x4, x12, x24 // a[4] x .Lp503p1_nz_s8[0]
- umulh x7, x12, x24
- adcs x17, x8, x17
- adcs x28, x9, x28
- adcs x30, x10, x30
- adcs x20, xzr, x20
- mul x5, x12, x25 // a[4] x .Lp503p1_nz_s8[1]
- umulh x6, x12, x25
- adcs x21, xzr, x21
- adcs x22, xzr, x22
- adc x23, xzr, x23
-
- // a[4-5] x .Lp503p1_nz_s8 --> result: x4:x9
$mul45
+ adds x14, x14, x4
+ adcs x15, x15, x5
+ adcs x16, x16, x6
+ adcs x17, x17, x7
+ adcs x19, x22, x8
+ adcs x20, x20, x9
+ adc x22, x21, xzr
- orr x10, xzr, x9, lsr #8
- lsl x9, x9, #56
- orr x9, x9, x8, lsr #8
- lsl x8, x8, #56
- orr x8, x8, x7, lsr #8
- lsl x7, x7, #56
- orr x7, x7, x6, lsr #8
- lsl x6, x6, #56
- orr x6, x6, x5, lsr #8
- lsl x5, x5, #56
- orr x5, x5, x4, lsr #8
- lsl x4, x4, #56
+ stp x14, x15, [x1, #0x0] // C0, C1
- adds x15, x4, x15 // a[7]
- adcs x16, x5, x16 // a[8]
- adcs x17, x6, x17
- adcs x28, x7, x28
- mul x4, x14, x24 // a[6] x .Lp503p1_nz_s8[0]
- umulh x7, x14, x24
- adcs x30, x8, x30
- adcs x20, x9, x20
- adcs x21, x10, x21
- mul x5, x14, x25 // a[6] x .Lp503p1_nz_s8[1]
- umulh x6, x14, x25
- adcs x22, xzr, x22
- adc x23, xzr, x23
-
- // a[6-7] x .Lp503p1_nz_s8 --> result: x4:x9
$mul67
+ adds x16, x16, x4
+ adcs x17, x17, x5
+ adcs x19, x19, x6
+ adcs x20, x20, x7
+ adc x21, x22, x8
- orr x10, xzr, x9, lsr #8
- lsl x9, x9, #56
- orr x9, x9, x8, lsr #8
- lsl x8, x8, #56
- orr x8, x8, x7, lsr #8
- lsl x7, x7, #56
- orr x7, x7, x6, lsr #8
- lsl x6, x6, #56
- orr x6, x6, x5, lsr #8
- lsl x5, x5, #56
- orr x5, x5, x4, lsr #8
- lsl x4, x4, #56
-
- adds x17, x4, x17
- adcs x28, x5, x28
- ldr x1, [sp,#96]
- adcs x30, x6, x30
- adcs x20, x7, x20
- stp x16, x17, [x1,#0] // Final result
- stp x28, x30, [x1,#16]
- adcs x21, x8, x21
- adcs x22, x9, x22
- adc x23, x10, x23
- stp x20, x21, [x1,#32]
- stp x22, x23, [x1,#48]
+ str x16, [x1, #0x10]
+ stp x17, x19, [x1, #0x18]
+ stp x20, x21, [x1, #0x28]
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
- ldp x29, x30, [sp],#112
+ ldp x29, x30, [sp],#96
ret
-
___
}
-
$code.=&rdc();
-
# Field addition
# Operation: c [x2] = a [x0] + b [x1]
$code.=<<___;
@@ -588,49 +602,44 @@
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
+ ldp x7, x8, [x0,#32]
+ ldr x9, [x0,#48]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
+ ldp x15, x16, [x1,#32]
+ ldr x17, [x1,#48]
// Add a + b
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x13
adcs x6, x6, x14
- ldp x7, x8, [x0,#32]
- ldp x9, x10, [x0,#48]
- ldp x11, x12, [x1,#32]
- ldp x13, x14, [x1,#48]
- adcs x7, x7, x11
- adcs x8, x8, x12
- adcs x9, x9, x13
- adc x10, x10, x14
+ adcs x7, x7, x15
+ adcs x8, x8, x16
+ adc x9, x9, x17
- // Subtract 2xp503
- adrp x17, :pg_hi21:.Lp503x2
- add x17, x17, :lo12:.Lp503x2
+ // Subtract 2xp434
+ adrp x17, :pg_hi21:.Lp434x2
+ add x17, x17, :lo12:.Lp434x2
ldp x11, x12, [x17, #0]
ldp x13, x14, [x17, #16]
+ ldp x15, x16, [x17, #32]
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x12
sbcs x6, x6, x13
sbcs x7, x7, x14
-
- ldp x15, x16, [x17, #32]
- ldr x17, [x17, #48]
sbcs x8, x8, x15
sbcs x9, x9, x16
- sbcs x10, x10, x17
sbc x0, xzr, xzr // x0 can be reused now
- // Add 2xp503 anded with the mask in x0
+ // Add 2xp434 anded with the mask in x0
and x11, x11, x0
and x12, x12, x0
and x13, x13, x0
and x14, x14, x0
and x15, x15, x0
and x16, x16, x0
- and x17, x17, x0
adds x3, x3, x11
adcs x4, x4, x12
@@ -638,17 +647,15 @@
adcs x6, x6, x13
adcs x7, x7, x14
adcs x8, x8, x15
- adcs x9, x9, x16
- adc x10, x10, x17
+ adc x9, x9, x16
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
- stp x9, x10, [x2,#48]
+ str x9, [x2,#48]
ldp x29, x30, [sp],#16
ret
-
___
# Field subtraction
@@ -662,60 +669,58 @@
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
+ ldp x7, x8, [x0,#32]
+ ldr x9, [x0,#48]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
+ ldp x15, x16, [x1,#32]
+ ldr x17, [x1,#48]
// Subtract a - b
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
- ldp x7, x8, [x0,#32]
- ldp x11, x12, [x1,#32]
- sbcs x7, x7, x11
- sbcs x8, x8, x12
- ldp x9, x10, [x0,#48]
- ldp x11, x12, [x1,#48]
- sbcs x9, x9, x11
- sbcs x10, x10, x12
- sbc x17, xzr, xzr
+ sbcs x7, x7, x15
+ sbcs x8, x8, x16
+ sbcs x9, x9, x17
+ sbc x0, xzr, xzr
- // Add 2xp503 anded with the mask in x17
- adrp x16, :pg_hi21:.Lp503x2
- add x16, x16, :lo12:.Lp503x2
+ // Add 2xp434 anded with the mask in x0
+ adrp x17, :pg_hi21:.Lp434x2
+ add x17, x17, :lo12:.Lp434x2
// First half
- ldp x11, x12, [x16, #0]
- ldp x13, x14, [x16, #16]
- and x11, x11, x17
- and x12, x12, x17
- and x13, x13, x17
+ ldp x11, x12, [x17, #0]
+ ldp x13, x14, [x17, #16]
+ ldp x15, x16, [x17, #32]
+
+ // Add 2xp434 anded with the mask in x0
+ and x11, x11, x0
+ and x12, x12, x0
+ and x13, x13, x0
+ and x14, x14, x0
+ and x15, x15, x0
+ and x16, x16, x0
+
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x12
adcs x6, x6, x13
+ adcs x7, x7, x14
+ adcs x8, x8, x15
+ adc x9, x9, x16
+
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
-
- // Second half
- ldp x11, x12, [x16, #32]
- ldr x13, [x16, #48]
- and x14, x14, x17
- and x11, x11, x17
- and x12, x12, x17
- and x13, x13, x17
- adcs x7, x7, x14
- adcs x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x13
stp x7, x8, [x2,#32]
- stp x9, x10, [x2,#48]
+ str x9, [x2,#48]
ldp x29, x30, [sp],#16
ret
___
-# 503-bit multiprecision addition
+# 434-bit multiprecision addition
# Operation: c [x2] = a [x0] + b [x1]
$code.=<<___;
.global ${PREFIX}_mpadd_asm
@@ -726,92 +731,31 @@
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
+ ldp x7, x8, [x0,#32]
+ ldr x9, [x0,#48]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
+ ldp x15, x16, [x1,#32]
+ ldr x17, [x1,#48]
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x13
adcs x6, x6, x14
- ldp x7, x8, [x0,#32]
- ldp x9, x10, [x0,#48]
- ldp x11, x12, [x1,#32]
- ldp x13, x14, [x1,#48]
- adcs x7, x7, x11
- adcs x8, x8, x12
- adcs x9, x9, x13
- adc x10, x10, x14
+ adcs x7, x7, x15
+ adcs x8, x8, x16
+ adc x9, x9, x17
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
- stp x9, x10, [x2,#48]
+ str x9, [x2,#48]
ldp x29, x30, [sp],#16
ret
___
-
-# 2x503-bit multiprecision addition
-# Operation: c [x2] = a [x0] + b [x1]
-$code.=<<___;
- .global ${PREFIX}_mpadd503x2_asm
- .align 4
- ${PREFIX}_mpadd503x2_asm:
- stp x29, x30, [sp,#-16]!
- add x29, sp, #0
-
- ldp x3, x4, [x0,#0]
- ldp x5, x6, [x0,#16]
- ldp x11, x12, [x1,#0]
- ldp x13, x14, [x1,#16]
- adds x3, x3, x11
- adcs x4, x4, x12
- adcs x5, x5, x13
- adcs x6, x6, x14
- ldp x7, x8, [x0,#32]
- ldp x9, x10, [x0,#48]
- ldp x11, x12, [x1,#32]
- ldp x13, x14, [x1,#48]
- adcs x7, x7, x11
- adcs x8, x8, x12
- adcs x9, x9, x13
- adcs x10, x10, x14
-
- stp x3, x4, [x2,#0]
- stp x5, x6, [x2,#16]
- stp x7, x8, [x2,#32]
- stp x9, x10, [x2,#48]
-
- ldp x3, x4, [x0,#64]
- ldp x5, x6, [x0,#80]
- ldp x11, x12, [x1,#64]
- ldp x13, x14, [x1,#80]
- adcs x3, x3, x11
- adcs x4, x4, x12
- adcs x5, x5, x13
- adcs x6, x6, x14
- ldp x7, x8, [x0,#96]
- ldp x9, x10, [x0,#112]
- ldp x11, x12, [x1,#96]
- ldp x13, x14, [x1,#112]
- adcs x7, x7, x11
- adcs x8, x8, x12
- adcs x9, x9, x13
- adc x10, x10, x14
-
- stp x3, x4, [x2,#64]
- stp x5, x6, [x2,#80]
- stp x7, x8, [x2,#96]
- stp x9, x10, [x2,#112]
-
- ldp x29, x30, [sp],#16
- ret
-___
-
-
-
-# 2x503-bit multiprecision subtraction
+# 2x434-bit multiprecision subtraction
# Operation: c [x2] = a [x0] - b [x1].
# Returns borrow mask
$code.=<<___;
@@ -852,111 +796,114 @@
sbcs x5, x5, x13
sbcs x6, x6, x14
ldp x7, x8, [x0,#96]
- ldp x9, x10, [x0,#112]
ldp x11, x12, [x1,#96]
- ldp x13, x14, [x1,#112]
sbcs x7, x7, x11
sbcs x8, x8, x12
- sbcs x9, x9, x13
- sbcs x10, x10, x14
sbc x0, xzr, xzr
stp x3, x4, [x2,#64]
stp x5, x6, [x2,#80]
stp x7, x8, [x2,#96]
- stp x9, x10, [x2,#112]
ldp x29, x30, [sp],#16
ret
___
-# Double 2x503-bit multiprecision subtraction
+# Double 2x434-bit multiprecision subtraction
# Operation: c [x2] = c [x2] - a [x0] - b [x1]
$code.=<<___;
.global ${PREFIX}_mpdblsubx2_asm
.align 4
${PREFIX}_mpdblsubx2_asm:
- stp x29, x30, [sp, #-64]!
+ stp x29, x30, [sp, #-16]!
add x29, sp, #0
- stp x20, x21, [sp, #16]
- stp x22, x23, [sp, #32]
- str x24, [sp, #48]
-
- ldp x3, x4, [x2,#0]
+ ldp x3, x4, [x2, #0]
ldp x5, x6, [x2,#16]
ldp x7, x8, [x2,#32]
- ldp x9, x10, [x2,#48]
- ldp x11, x12, [x2,#64]
- ldp x13, x14, [x2,#80]
- ldp x15, x16, [x2,#96]
- ldp x17, x24, [x2,#112]
- ldp x20, x21, [x0,#0]
- ldp x22, x23, [x0,#16]
- subs x3, x3, x20
- sbcs x4, x4, x21
- sbcs x5, x5, x22
- sbcs x6, x6, x23
- ldp x20, x21, [x0,#32]
- ldp x22, x23, [x0,#48]
- sbcs x7, x7, x20
- sbcs x8, x8, x21
- sbcs x9, x9, x22
- sbcs x10, x10, x23
- ldp x20, x21, [x0,#64]
- ldp x22, x23, [x0,#80]
- sbcs x11, x11, x20
- sbcs x12, x12, x21
- sbcs x13, x13, x22
- sbcs x14, x14, x23
- ldp x20, x21, [x0,#96]
- ldp x22, x23, [x0,#112]
- sbcs x15, x15, x20
- sbcs x16, x16, x21
- sbcs x17, x17, x22
- sbc x24, x24, x23
+ ldp x11, x12, [x0, #0]
+ ldp x13, x14, [x0,#16]
+ ldp x15, x16, [x0,#32]
- ldp x20, x21, [x1,#0]
- ldp x22, x23, [x1,#16]
- subs x3, x3, x20
- sbcs x4, x4, x21
- sbcs x5, x5, x22
- sbcs x6, x6, x23
- ldp x20, x21, [x1,#32]
- ldp x22, x23, [x1,#48]
- sbcs x7, x7, x20
- sbcs x8, x8, x21
- sbcs x9, x9, x22
- sbcs x10, x10, x23
- ldp x20, x21, [x1,#64]
- ldp x22, x23, [x1,#80]
- sbcs x11, x11, x20
- sbcs x12, x12, x21
- sbcs x13, x13, x22
- sbcs x14, x14, x23
- ldp x20, x21, [x1,#96]
- ldp x22, x23, [x1,#112]
- sbcs x15, x15, x20
- sbcs x16, x16, x21
- sbcs x17, x17, x22
- sbc x24, x24, x23
+ subs x3, x3, x11
+ sbcs x4, x4, x12
+ sbcs x5, x5, x13
+ sbcs x6, x6, x14
+ sbcs x7, x7, x15
+ sbcs x8, x8, x16
- stp x3, x4, [x2,#0]
+ // x9 stores carry
+ adc x9, xzr, xzr
+
+ ldp x11, x12, [x1, #0]
+ ldp x13, x14, [x1,#16]
+ ldp x15, x16, [x1,#32]
+ subs x3, x3, x11
+ sbcs x4, x4, x12
+ sbcs x5, x5, x13
+ sbcs x6, x6, x14
+ sbcs x7, x7, x15
+ sbcs x8, x8, x16
+ adc x9, x9, xzr
+
+ stp x3, x4, [x2, #0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
- stp x9, x10, [x2,#48]
- stp x11, x12, [x2,#64]
- stp x13, x14, [x2,#80]
- stp x15, x16, [x2,#96]
- stp x17, x24, [x2,#112]
- ldp x20, x21, [x29,#16]
- ldp x22, x23, [x29,#32]
- ldr x24, [x29,#48]
+ ldp x3, x4, [x2,#48]
+ ldp x5, x6, [x2,#64]
+ ldp x7, x8, [x2,#80]
- ldp x29, x30, [sp],#64
+ ldp x11, x12, [x0,#48]
+ ldp x13, x14, [x0,#64]
+ ldp x15, x16, [x0,#80]
+
+ // x9 = 2 - x9
+ neg x9, x9
+ add x9, x9, #2
+
+ subs x3, x3, x9
+ sbcs x3, x3, x11
+ sbcs x4, x4, x12
+ sbcs x5, x5, x13
+ sbcs x6, x6, x14
+ sbcs x7, x7, x15
+ sbcs x8, x8, x16
+ adc x9, xzr, xzr
+
+ ldp x11, x12, [x1,#48]
+ ldp x13, x14, [x1,#64]
+ ldp x15, x16, [x1,#80]
+ subs x3, x3, x11
+ sbcs x4, x4, x12
+ sbcs x5, x5, x13
+ sbcs x6, x6, x14
+ sbcs x7, x7, x15
+ sbcs x8, x8, x16
+ adc x9, x9, xzr
+
+ stp x3, x4, [x2,#48]
+ stp x5, x6, [x2,#64]
+ stp x7, x8, [x2,#80]
+
+ ldp x3, x4, [x2,#96]
+ ldp x11, x12, [x0,#96]
+ ldp x13, x14, [x1,#96]
+
+ // x9 = 2 - x9
+ neg x9, x9
+ add x9, x9, #2
+
+ subs x3, x3, x9
+ sbcs x3, x3, x11
+ sbcs x4, x4, x12
+ subs x3, x3, x13
+ sbc x4, x4, x14
+ stp x3, x4, [x2,#96]
+
+ ldp x29, x30, [sp],#16
ret
___
diff --git a/third_party/sike/asm/fp-x86_64.pl b/third_party/sike/asm/fp-x86_64.pl
index c093c20..cffde1a 100755
--- a/third_party/sike/asm/fp-x86_64.pl
+++ b/third_party/sike/asm/fp-x86_64.pl
@@ -2,7 +2,7 @@
#
# April 2019
#
-# Abstract: field arithmetic in x64 assembly for SIDH/p503
+# Abstract: field arithmetic in x64 assembly for SIDH/p434
$flavour = shift;
$output = shift;
@@ -22,76 +22,341 @@
$code.=<<___;
.text
-# p503 x 2
-.Lp503x2:
+# p434 x 2
+.Lp434x2:
.quad 0xFFFFFFFFFFFFFFFE
.quad 0xFFFFFFFFFFFFFFFF
-.quad 0x57FFFFFFFFFFFFFF
-.quad 0x2610B7B44423CF41
-.quad 0x3737ED90F6FCFB5E
-.quad 0xC08B8D7BB4EF49A0
-.quad 0x0080CDEA83023C3C
+.quad 0xFB82ECF5C5FFFFFF
+.quad 0xF78CB8F062B15D47
+.quad 0xD9F8BFAD038A40AC
+.quad 0x0004683E4E2EE688
-# p503 + 1
-.Lp503p1:
-.quad 0xAC00000000000000
-.quad 0x13085BDA2211E7A0
-.quad 0x1B9BF6C87B7E7DAF
-.quad 0x6045C6BDDA77A4D0
-.quad 0x004066F541811E1E
-
-.Lp503p1_nz:
-.quad 0xAC00000000000000
-.quad 0x13085BDA2211E7A0
-.quad 0x1B9BF6C87B7E7DAF
-.quad 0x6045C6BDDA77A4D0
-.quad 0x004066F541811E1E
+# p434 + 1
+.Lp434p1:
+.quad 0xFDC1767AE3000000
+.quad 0x7BC65C783158AEA3
+.quad 0x6CFC5FD681C52056
+.quad 0x0002341F27177344
.extern OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P
-
___
-# Performs schoolbook multiplication of 128-bit with 320-bit
-# number. Uses MULX, ADOX, ADCX instruction.
-sub mul128x320_school {
- my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7,$T8,$T9)=@_;
- my ($MUL0,$MUL8)=map("$idxM0+$_(%$M0)", (0,8));
+# Jump to alternative implementation provided as an
+# argument in case CPU supports ADOX/ADCX and MULX instructions.
+sub alt_impl {
+ $jmp_func = shift;
+
+ $body=<<___;
+ lea OPENSSL_ia32cap_P(%rip), %rcx
+ mov 8(%rcx), %rcx
+ and \$0x80100, %ecx
+ cmp \$0x80100, %ecx
+ je $jmp_func
+
+___
+ return $body
+}
+
+# Performs schoolbook multiplication of 2 192-bit numbers. Uses
+# MULX instruction. Result is stored in 384 bits pointed by $DST.
+sub mul192 {
+ my ($idxM0,$M0,$idxM1,$M1,$idxDST,$DST,$T0,$T1,$T2,$T3,$T4,$T5,$T6)=@_;
+ my ($ML0,$ML8,$ML16)=map("$idxM0+$_($M0)",(0,8,16));
+ my ($MR0,$MR8,$MR16)=map("$idxM1+$_($M1)",(0,8,16));
+ my ($D0,$D1,$D2,$D3,$D4,$D5)=map("$idxDST+$_($DST)",(0,8,16,24,32,40));
+
+ $body=<<___;
+ mov $ML0, %rdx
+ mulx $MR0, $T1, $T0 # T0:T1 = A0*B0
+ mov $T1, $D0 # DST0
+ mulx $MR8, $T2, $T1 # T1:T2 = A0*B1
+ xor %rax, %rax
+ adox $T2, $T0
+ mulx $MR16,$T3, $T2 # T2:T3 = A0*B2
+ adox $T3, $T1
+
+ mov $ML8, %rdx
+ mulx $MR0, $T4, $T3 # T3:T4 = A1*B0
+ adox %rax, $T2
+ xor %rax, %rax
+
+ mulx $MR8, $T6, $T5 # T5:T6 = A1*B1
+ adox $T0, $T4
+ mov $T4, $D1 # DST1
+ adcx $T6, $T3
+
+ mulx $MR16,$T0, $T6 # T6:T0 = A1*B2
+ adox $T1, $T3
+ adcx $T0, $T5
+ adcx %rax, $T6
+ adox $T2, $T5
+
+ mov $ML16,%rdx
+ mulx $MR0, $T0, $T1 # T1:T0 = A2*B0
+ adox %rax, $T6
+ xor %rax, %rax
+
+ mulx $MR8, $T2, $T4 # T4:T2 = A2*B1
+ adox $T3, $T0
+ mov $T0, $D2 # DST2
+ adcx $T5, $T1
+
+ mulx $MR16,$T3, $T0 # T0:T3 = A2*B2
+ adcx $T6, $T4
+ adcx %rax, $T0
+ adox $T2, $T1
+ adox $T4, $T3
+ adox %rax, $T0
+ mov $T1, $D3 # DST3
+ mov $T3, $D4 # DST4
+ mov $T0, $D5 # DST5
+
+___
+ return $body;
+}
+
+# Performs schoolbook multiplication of 2 256-bit numbers. Uses
+# MULX instruction. Result is stored in 512 bits pointed by $DST.
+sub mul256 {
+ my ($idxM0,$M0,$idxM1,$M1,$idxDST,$DST,$T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7,$T8,$T9)=@_;
+ my ($ML0,$ML8,$ML16,$ML24)=map("$idxM0+$_($M0)",(0,8,16,24));
+ my ($MR0,$MR8,$MR16,$MR24)=map("$idxM1+$_($M1)",(0,8,16,24));
+ my ($D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7)=map("$idxDST+$_($DST)",(0,8,16,24,32,40,48,56));
+
+ $body=<<___;
+ mov $ML0, %rdx
+ mulx $MR0, $T1, $T0 # T0:T1 = A0*B0
+ mov $T1, $D0 # DST0_final
+ mulx $MR8, $T2, $T1 # T1:T2 = A0*B1
+ xor %rax, %rax
+ adox $T2, $T0
+ mulx $MR16,$T3, $T2 # T2:T3 = A0*B2
+ adox $T3, $T1
+ mulx $MR24,$T4, $T3 # T3:T4 = A0*B3
+ adox $T4, $T2
+
+ mov $ML8, %rdx
+ mulx $MR0, $T4, $T5 # T5:T4 = A1*B0
+ adox %rax, $T3
+ xor %rax, %rax
+ mulx $MR8, $T7, $T6 # T6:T7 = A1*B1
+ adox $T0, $T4
+ mov $T4, $D1 # DST1_final
+ adcx $T7, $T5
+ mulx $MR16,$T8, $T7 # T7:T8 = A1*B2
+ adcx $T8, $T6
+ adox $T1, $T5
+ mulx $MR24,$T9, $T8 # T8:T9 = A1*B3
+ adcx $T9, $T7
+ adcx %rax, $T8
+ adox $T2, $T6
+
+ mov $ML16,%rdx
+ mulx $MR0, $T0, $T1 # T1:T0 = A2*B0
+ adox $T3, $T7
+ adox %rax, $T8
+ xor %rax, %rax
+ mulx $MR8, $T3, $T2 # T2:T3 = A2*B1
+ adox $T5, $T0
+ mov $T0, $D2 # DST2_final
+ adcx $T3, $T1
+ mulx $MR16,$T4, $T3 # T3:T4 = A2*B2
+ adcx $T4, $T2
+ adox $T6, $T1
+ mulx $MR24,$T9, $T4 # T4:T9 = A2*B3
+ adcx $T9, $T3
+ adcx %rax, $T4
+
+ adox $T7, $T2
+ adox $T8, $T3
+ adox %rax, $T4
+
+ mov $ML24,%rdx
+ mulx $MR0, $T0, $T5 # T5:T0 = A3*B0
+ xor %rax, %rax
+ mulx $MR8, $T7, $T6 # T6:T7 = A3*B1
+ adcx $T7, $T5
+ adox $T0, $T1
+ mulx $MR16, $T8, $T7 # T7:T8 = A3*B2
+ adcx $T8, $T6
+ adox $T5, $T2
+ mulx $MR24, $T9, $T8 # T8:T9 = A3*B3
+ adcx $T9, $T7
+ adcx %rax, $T8
+ adox $T6, $T3
+ adox $T7, $T4
+ adox %rax, $T8
+ mov $T1, $D3 # DST3_final
+ mov $T2, $D4 # DST4_final
+ mov $T3, $D5 # DST5_final
+ mov $T4, $D6 # DST6_final
+ mov $T8, $D7 # DST7_final
+
+___
+ return $body;
+}
+
+# Performs schoolbook multiplication of 64-bit with 256-bit
+# number.
+sub mul64x256 {
+ my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5)=@_;
my $body.=<<___;
+ mov $idxM0($M0), $T5
+
+ xor $T2, $T2
+ mov 0+$M1, %rax
+ mul $T5
+ mov %rax, $T0 # C0
+ mov %rdx, $T1
+
+ xor $T3, $T3
+ mov 8+$M1, %rax
+ mul $T5
+ add %rax, $T1 # C1
+ adc %rdx, $T2
+
+ xor $T4, $T4
+ mov 16+$M1, %rax
+ mul $T5
+ add %rax, $T2 # C2
+ adc %rdx, $T3
+
+ mov 24+$M1, %rax
+ mul $T5
+ add %rax, $T3 # C3
+ adc %rdx, $T4 # C4
+___
+ return $body;
+}
+
+# Performs schoolbook multiplication of 64-bit with 256-bit
+# number. Uses MULX and ADOX instructions.
+sub mulx64x256 {
+ my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5)=@_;
+ my $body.=<<___;
+ xor %rax, %rax
+ mov $idxM0($M0), %rdx
+ mulx 0+$M1, $T0, $T1 # T0 <- C0
+ mulx 8+$M1, $T4, $T2
+ mulx 16+$M1, $T5, $T3
+
+ adox $T4, $T1 # T1 <- C1
+ adox $T5, $T2 # T2 <- C2
+
+ mulx 24+$M1, $T5, $T4
+ adox $T5, $T3 # T3 <- C3
+ adox %rax, $T4 # T4 <- C4
+___
+ return $body;
+}
+
+# Performs schoolbook multiplication of 128-bit with 256-bit
+# number. Destroys RAX and RDX
+sub mul128x256 {
+ my ($idxMA,$MA,$MB,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1)=@_;
+ my ($MA0,$MA8)=map("$idxMA+$_($MA)", (0,8));
+ my $body.=<<___;
+ # A0 x B0
+ mov $MA0, $T0
+ mov 0+$MB, %rax
+ mul $T0
+ xor $C2, $C2
+ mov %rax, $C0 # c0
+ mov %rdx, $C1
+
+ # A0 x B1
+ mov 8+$MB, %rax
+ mul $T0
+ xor $C3, $C3
+ add %rax, $C1
+ adc %rdx, $C2
+
+ # A1 x B0
+ mov $MA8, $T1
+ mov 0+$MB, %rax
+ mul $T1
+ add %rax, $C1
+ adc %rdx, $C2
+ adc \$0x0, $C3
+
+ # A0 x B2
+ xor $C4, $C4
+ mov 16+$MB, %rax
+ mul $T0
+ add %rax, $C2
+ adc %rdx, $C3
+ adc \$0x0, $C4
+
+ # A1 x B1
+ mov 8+$MB, %rax
+ mul $T1
+ add %rax, $C2 # c2
+ adc %rdx, $C3
+ adc \$0x0, $C4
+
+ # A0 x B3
+ mov 24+$MB, %rax
+ mul $T0
+ xor $C5, $C5
+ add %rax, $C3
+ adc %rdx, $C4
+ adc \$0x0, $C5
+
+ # A1 x B2
+ mov 16+$MB, %rax
+ mul $T1
+ add %rax, $C3 # c3
+ adc %rdx, $C4
+ adc \$0x0, $C5
+
+ # A1 x B3
+ mov 24+$MB, %rax
+ mul $T1
+ add %rax, $C4
+ adc %rdx, $C5
+
+___
+ return $body;
+}
+
+# Performs schoolbook multiplication of 128-bit with 256-bit
+# number. Uses MULX, ADOX, ADCX instruction.
+sub mulx128x256 {
+ my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5,$T6)=@_;
+ my ($MUL0,$MUL8)=map("$idxM0+$_($M0)", (0,8));
+ my $body.=<<___;
+ xor %rax, %rax
mov $MUL0, %rdx
- mulx 0+$M1, %$T0, %$T1 # T0 <- C0_final
- mulx 8+$M1, %$T4, %$T2
+ mulx 0+$M1, $T0, $T1 # T0 <- C0
+ mulx 8+$M1, $T4, $T2
+ mulx 16+$M1, $T5, $T3
+
+ adox $T4, $T1 # T1: interm1
+ adox $T5, $T2 # T2: interm2
+
+ mulx 24+$M1, $T5, $T4
+ adox $T5, $T3 # T3: interm3
+ adox %rax, $T4 # T4: interm4
xor %rax, %rax
- mulx 16+$M1, %$T5, %$T3
- adox %$T4, %$T1
- adox %$T5, %$T2
- mulx 24+$M1, %$T7, %$T4
- adox %$T7, %$T3
- mulx 32+$M1, %$T6, %$T5
- adox %$T6, %$T4
- adox %rax, %$T5
-
mov $MUL8, %rdx
- mulx 0+$M1, %$T6, %$T7
- adcx %$T6, %$T1 # T1 <- C1_final
- adcx %$T7, %$T2
- mulx 8+$M1, %$T8, %$T6
- adcx %$T6, %$T3
- mulx 16+$M1, %$T7, %$T9
- adcx %$T9, %$T4
- mulx 24+$M1, %$T9, %$T6
- adcx %$T6, %$T5
- mulx 32+$M1, %rdx, %$T6
- adcx %rax, %$T6
+ mulx 0+$M1, $T5, $T6
+ adcx $T5, $T1 # T1 <- C1
+ adcx $T6, $T2
- xor %rax, %rax
- adox %$T8, %$T2
- adox %$T7, %$T3
- adox %$T9, %$T4
- adox %rdx, %$T5
- adox %rax, %$T6
+ mulx 8+$M1, $T6, $T5
+ adcx $T5, $T3
+ adox $T6, $T2 # T2 <- C2
+ mulx 16+$M1, $T6, $T5
+ adcx $T5, $T4
+ adox $T6, $T3 # T3 <- C3
+
+ mulx 24+$M1, $T6, $T5
+ adcx %rax, $T5
+ adox $T6, $T4 # T4 <- C4
+ adox %rax, $T5 # T5 <- C5
___
return $body;
}
@@ -112,87 +377,72 @@
push %r14
.cfi_adjust_cfa_offset 8
.cfi_offset r14, -32
- push %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset r15, -40
- xor %rax, %rax
+ xor %rax, %rax
- mov 0x0(%rdi), %r8
- mov 0x8(%rdi), %r9
- mov 0x10(%rdi), %r10
- mov 0x18(%rdi), %r11
- mov 0x20(%rdi), %r12
- mov 0x28(%rdi), %r13
- mov 0x30(%rdi), %r14
- mov 0x38(%rdi), %r15
+ mov 0x0(%rdi), %r8
+ add 0x0(%rsi), %r8
+ mov 0x8(%rdi), %r9
+ adc 0x8(%rsi), %r9
+ mov 0x10(%rdi), %r10
+ adc 0x10(%rsi), %r10
+ mov 0x18(%rdi), %r11
+ adc 0x18(%rsi), %r11
+ mov 0x20(%rdi), %r12
+ adc 0x20(%rsi), %r12
+ mov 0x28(%rdi), %r13
+ adc 0x28(%rsi), %r13
+ mov 0x30(%rdi), %r14
+ adc 0x30(%rsi), %r14
- add 0x0(%rsi), %r8
- adc 0x8(%rsi), %r9
- adc 0x10(%rsi), %r10
- adc 0x18(%rsi), %r11
- adc 0x20(%rsi), %r12
- adc 0x28(%rsi), %r13
- adc 0x30(%rsi), %r14
- adc 0x38(%rsi), %r15
+ mov .Lp434x2(%rip), %rcx
+ sub %rcx, %r8
+ mov 0x8+.Lp434x2(%rip), %rcx
+ sbb %rcx, %r9
+ sbb %rcx, %r10
+ mov 0x10+.Lp434x2(%rip), %rcx
+ sbb %rcx, %r11
+ mov 0x18+.Lp434x2(%rip), %rcx
+ sbb %rcx, %r12
+ mov 0x20+.Lp434x2(%rip), %rcx
+ sbb %rcx, %r13
+ mov 0x28+.Lp434x2(%rip), %rcx
+ sbb %rcx, %r14
- mov .Lp503x2(%rip), %rcx;
- sub %rcx, %r8
- mov 8+.Lp503x2(%rip), %rcx;
- sbb %rcx, %r9
- sbb %rcx, %r10
- mov 16+.Lp503x2(%rip), %rcx;
- sbb %rcx, %r11
- mov 24+.Lp503x2(%rip), %rcx;
- sbb %rcx, %r12
- mov 32+.Lp503x2(%rip), %rcx;
- sbb %rcx, %r13
- mov 40+.Lp503x2(%rip), %rcx;
- sbb %rcx, %r14
- mov 48+.Lp503x2(%rip), %rcx;
- sbb %rcx, %r15
- sbb \$0, %rax
+ sbb \$0, %rax
- mov .Lp503x2(%rip), %rdi
- and %rax, %rdi
- mov 8+.Lp503x2(%rip), %rsi
- and %rax, %rsi
- mov 16+.Lp503x2(%rip), %rcx
- and %rax, %rcx
+ mov .Lp434x2(%rip), %rdi
+ and %rax, %rdi
+ mov 0x8+.Lp434x2(%rip), %rsi
+ and %rax, %rsi
+ mov 0x10+.Lp434x2(%rip), %rcx
+ and %rax, %rcx
- add %rdi, %r8
- mov %r8, 0x0(%rdx)
- adc %rsi, %r9
- mov %r9, 0x8(%rdx)
- adc %rsi, %r10
- mov %r10, 0x10(%rdx)
- adc %rcx, %r11
- mov %r11, 0x18(%rdx)
+ add %rdi, %r8
+ mov %r8, 0x0(%rdx)
+ adc %rsi, %r9
+ mov %r9, 0x8(%rdx)
+ adc %rsi, %r10
+ mov %r10, 0x10(%rdx)
+ adc %rcx, %r11
+ mov %r11, 0x18(%rdx)
- setc %cl
-
- mov 24+.Lp503x2(%rip), %r8
- and %rax, %r8
- mov 32+.Lp503x2(%rip), %r9
- and %rax, %r9
- mov 40+.Lp503x2(%rip), %r10
- and %rax, %r10
- mov 48+.Lp503x2(%rip), %r11
- and %rax, %r11
-
+ setc %cl
+ mov 0x18+.Lp434x2(%rip), %r8
+ and %rax, %r8
+ mov 0x20+.Lp434x2(%rip), %r9
+ and %rax, %r9
+ mov 0x28+.Lp434x2(%rip), %r10
+ and %rax, %r10
bt \$0, %rcx
- adc %r8, %r12
- mov %r12, 0x20(%rdx)
- adc %r9, %r13
- mov %r13, 0x28(%rdx)
- adc %r10, %r14
- mov %r14, 0x30(%rdx)
- adc %r11, %r15
- mov %r15, 0x38(%rdx)
+ adc %r8, %r12
+ mov %r12, 0x20(%rdx)
+ adc %r9, %r13
+ mov %r13, 0x28(%rdx)
+ adc %r10, %r14
+ mov %r14, 0x30(%rdx)
- pop %r15
-.cfi_adjust_cfa_offset -8
pop %r14
.cfi_adjust_cfa_offset -8
pop %r13
@@ -203,8 +453,6 @@
.cfi_endproc
___
-
-
# Loads data to XMM0 and XMM1 and
# conditionaly swaps depending on XMM3
sub cswap_block16() {
@@ -226,15 +474,11 @@
# Conditionally swaps bits in x and y in constant time.
# mask indicates bits to be swapped (set bits are swapped)
# Operation: [rdi] <-> [rsi] if rdx==1
-sub cswap {
- # P[0].X with Q[0].X
- foreach ( 0.. 3){$BLOCKS.=eval "&cswap_block16($_)";}
- # P[0].Z with Q[0].Z
- foreach ( 4.. 7){$BLOCKS.=eval "&cswap_block16($_)";}
- # P[1].X with Q[1].X
- foreach ( 8..11){$BLOCKS.=eval "&cswap_block16($_)";}
- # P[1].Z with Q[1].Z
- foreach (12..15){$BLOCKS.=eval "&cswap_block16($_)";}
+sub sike_cswap {
+ # P[0] with Q[0]
+ foreach ( 0.. 6){$BLOCKS.=eval "&cswap_block16($_)";}
+ # P[1] with Q[1]
+ foreach ( 7..13){$BLOCKS.=eval "&cswap_block16($_)";}
my $body =<<___;
.globl ${PREFIX}_cswap_asm
@@ -254,7 +498,8 @@
___
($body)
}
-$code.=&cswap();
+$code.=&sike_cswap();
+
# Field subtraction
# Operation: c [rdx] = a [rdi] - b [rsi]
@@ -272,71 +517,58 @@
push %r14
.cfi_adjust_cfa_offset 8
.cfi_offset r14, -32
- push %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset r15, -40
xor %rax, %rax
- mov 0x0(%rdi), %r8
- mov 0x8(%rdi), %r9
- mov 0x10(%rdi), %r10
- mov 0x18(%rdi), %r11
- mov 0x20(%rdi), %r12
- mov 0x28(%rdi), %r13
- mov 0x30(%rdi), %r14
- mov 0x38(%rdi), %r15
+ mov 0x0(%rdi), %r8
+ sub 0x0(%rsi), %r8
+ mov 0x8(%rdi), %r9
+ sbb 0x8(%rsi), %r9
+ mov 0x10(%rdi), %r10
+ sbb 0x10(%rsi), %r10
+ mov 0x18(%rdi), %r11
+ sbb 0x18(%rsi), %r11
+ mov 0x20(%rdi), %r12
+ sbb 0x20(%rsi), %r12
+ mov 0x28(%rdi), %r13
+ sbb 0x28(%rsi), %r13
+ mov 0x30(%rdi), %r14
+ sbb 0x30(%rsi), %r14
- sub 0x0(%rsi), %r8
- sbb 0x8(%rsi), %r9
- sbb 0x10(%rsi), %r10
- sbb 0x18(%rsi), %r11
- sbb 0x20(%rsi), %r12
- sbb 0x28(%rsi), %r13
- sbb 0x30(%rsi), %r14
- sbb 0x38(%rsi), %r15
- sbb \$0x0, %rax
+ sbb \$0x0, %rax
- mov .Lp503x2(%rip), %rdi
- and %rax, %rdi
- mov 0x8+.Lp503x2(%rip), %rsi
- and %rax, %rsi
- mov 0x10+.Lp503x2(%rip), %rcx
- and %rax, %rcx
+ mov .Lp434x2(%rip), %rdi
+ and %rax, %rdi
+ mov 0x08+.Lp434x2(%rip), %rsi
+ and %rax, %rsi
+ mov 0x10+.Lp434x2(%rip), %rcx
+ and %rax, %rcx
- add %rdi, %r8
- adc %rsi, %r9
- adc %rsi, %r10
- adc %rcx, %r11
- mov %r8, 0x0(%rdx)
- mov %r9, 0x8(%rdx)
- mov %r10, 0x10(%rdx)
- mov %r11, 0x18(%rdx)
+ add %rdi, %r8
+ mov %r8, 0x0(%rdx)
+ adc %rsi, %r9
+ mov %r9, 0x8(%rdx)
+ adc %rsi, %r10
+ mov %r10, 0x10(%rdx)
+ adc %rcx, %r11
+ mov %r11, 0x18(%rdx)
- setc %cl
+ setc %cl
+ mov 0x18+.Lp434x2(%rip), %r8
+ and %rax, %r8
+ mov 0x20+.Lp434x2(%rip), %r9
+ and %rax, %r9
+ mov 0x28+.Lp434x2(%rip), %r10
+ and %rax, %r10
+ bt \$0x0, %rcx
- mov 0x18+.Lp503x2(%rip), %r8
- and %rax, %r8
- mov 0x20+.Lp503x2(%rip), %r9
- and %rax, %r9
- mov 0x28+.Lp503x2(%rip), %r10
- and %rax, %r10
- mov 0x30+.Lp503x2(%rip), %r11
- and %rax, %r11
+ adc %r8, %r12
+ adc %r9, %r13
+ adc %r10, %r14
+ mov %r12, 0x20(%rdx)
+ mov %r13, 0x28(%rdx)
+ mov %r14, 0x30(%rdx)
- bt \$0x0, %rcx
-
- adc %r8, %r12
- adc %r9, %r13
- adc %r10, %r14
- adc %r11, %r15
- mov %r12, 0x20(%rdx)
- mov %r13, 0x28(%rdx)
- mov %r14, 0x30(%rdx)
- mov %r15, 0x38(%rdx)
-
- pop %r15
-.cfi_adjust_cfa_offset -8
pop %r14
.cfi_adjust_cfa_offset -8
pop %r13
@@ -347,43 +579,40 @@
.cfi_endproc
___
-# 503-bit multiprecision addition
+# 434-bit multiprecision addition (seven 64-bit words, no reduction; final carry is dropped)
# Operation: c [rdx] = a [rdi] + b [rsi]
$code.=<<___;
.globl ${PREFIX}_mpadd_asm
.type ${PREFIX}_mpadd_asm,\@function,3
${PREFIX}_mpadd_asm:
.cfi_startproc
- mov 0x0(%rdi), %r8
- mov 0x8(%rdi), %r9
- mov 0x10(%rdi), %r10
- mov 0x18(%rdi), %r11
- add 0x0(%rsi), %r8
- adc 0x8(%rsi), %r9
- adc 0x10(%rsi), %r10
- adc 0x18(%rsi), %r11
- mov %r8, 0x0(%rdx)
- mov %r9, 0x8(%rdx)
- mov %r10, 0x10(%rdx)
- mov %r11, 0x18(%rdx)
+ mov 0x0(%rdi), %r8 # load a[0..4]
+ mov 0x8(%rdi), %r9
+ mov 0x10(%rdi), %r10
+ mov 0x18(%rdi), %r11
+ mov 0x20(%rdi), %rcx
+ add 0x0(%rsi), %r8 # a[0..4] + b[0..4], carry chained through CF
+ adc 0x8(%rsi), %r9
+ adc 0x10(%rsi), %r10
+ adc 0x18(%rsi), %r11
+ adc 0x20(%rsi), %rcx
+ mov %r8, 0x0(%rdx) # store c[0..4]; mov does not modify CF
+ mov %r9, 0x8(%rdx)
+ mov %r10, 0x10(%rdx)
+ mov %r11, 0x18(%rdx)
+ mov %rcx, 0x20(%rdx)
- mov 0x20(%rdi), %r8
- mov 0x28(%rdi), %r9
- mov 0x30(%rdi), %r10
- mov 0x38(%rdi), %r11
- adc 0x20(%rsi), %r8
- adc 0x28(%rsi), %r9
- adc 0x30(%rsi), %r10
- adc 0x38(%rsi), %r11
- mov %r8, 0x20(%rdx)
- mov %r9, 0x28(%rdx)
- mov %r10, 0x30(%rdx)
- mov %r11, 0x38(%rdx)
+ mov 0x28(%rdi), %r8 # load a[5..6]
+ mov 0x30(%rdi), %r9
+ adc 0x28(%rsi), %r8 # continues the carry chain from word 4
+ adc 0x30(%rsi), %r9
+ mov %r8, 0x28(%rdx) # store c[5..6]
+ mov %r9, 0x30(%rdx)
ret
.cfi_endproc
___
-# 2x503-bit multiprecision subtraction
+# 2x434-bit multiprecision subtraction
# Operation: c [rdx] = a [rdi] - b [rsi].
# Returns borrow mask
$code.=<<___;
@@ -391,65 +620,59 @@
.type ${PREFIX}_mpsubx2_asm,\@function,3
${PREFIX}_mpsubx2_asm:
.cfi_startproc
- xor %rax, %rax
+ xor %rax, %rax
- mov 0x0(%rdi), %r8
- mov 0x8(%rdi), %r9
- mov 0x10(%rdi), %r10
- mov 0x18(%rdi), %r11
- mov 0x20(%rdi), %rcx
- sub 0x0(%rsi), %r8
- sbb 0x8(%rsi), %r9
- sbb 0x10(%rsi), %r10
- sbb 0x18(%rsi), %r11
- sbb 0x20(%rsi), %rcx
- mov %r8, 0x0(%rdx)
- mov %r9, 0x8(%rdx)
- mov %r10, 0x10(%rdx)
- mov %r11, 0x18(%rdx)
- mov %rcx, 0x20(%rdx)
+ mov 0x0(%rdi), %r8
+ mov 0x8(%rdi), %r9
+ mov 0x10(%rdi), %r10
+ mov 0x18(%rdi), %r11
+ mov 0x20(%rdi), %rcx
+ sub 0x0(%rsi), %r8
+ sbb 0x8(%rsi), %r9
+ sbb 0x10(%rsi), %r10
+ sbb 0x18(%rsi), %r11
+ sbb 0x20(%rsi), %rcx
+ mov %r8, 0x0(%rdx)
+ mov %r9, 0x8(%rdx)
+ mov %r10, 0x10(%rdx)
+ mov %r11, 0x18(%rdx)
+ mov %rcx, 0x20(%rdx)
- mov 0x28(%rdi), %r8
- mov 0x30(%rdi), %r9
- mov 0x38(%rdi), %r10
- mov 0x40(%rdi), %r11
- mov 0x48(%rdi), %rcx
- sbb 0x28(%rsi), %r8
- sbb 0x30(%rsi), %r9
- sbb 0x38(%rsi), %r10
- sbb 0x40(%rsi), %r11
- sbb 0x48(%rsi), %rcx
- mov %r8, 0x28(%rdx)
- mov %r9, 0x30(%rdx)
- mov %r10, 0x38(%rdx)
- mov %r11, 0x40(%rdx)
- mov %rcx, 0x48(%rdx)
+ mov 0x28(%rdi), %r8
+ mov 0x30(%rdi), %r9
+ mov 0x38(%rdi), %r10
+ mov 0x40(%rdi), %r11
+ mov 0x48(%rdi), %rcx
+ sbb 0x28(%rsi), %r8
+ sbb 0x30(%rsi), %r9
+ sbb 0x38(%rsi), %r10
+ sbb 0x40(%rsi), %r11
+ sbb 0x48(%rsi), %rcx
+ mov %r8, 0x28(%rdx)
+ mov %r9, 0x30(%rdx)
+ mov %r10, 0x38(%rdx)
+ mov %r11, 0x40(%rdx)
+ mov %rcx, 0x48(%rdx)
- mov 0x50(%rdi), %r8
- mov 0x58(%rdi), %r9
- mov 0x60(%rdi), %r10
- mov 0x68(%rdi), %r11
- mov 0x70(%rdi), %rcx
- sbb 0x50(%rsi), %r8
- sbb 0x58(%rsi), %r9
- sbb 0x60(%rsi), %r10
- sbb 0x68(%rsi), %r11
- sbb 0x70(%rsi), %rcx
- mov %r8, 0x50(%rdx)
- mov %r9, 0x58(%rdx)
- mov %r10, 0x60(%rdx)
- mov %r11, 0x68(%rdx)
- mov %rcx, 0x70(%rdx)
-
- mov 0x78(%rdi), %r8
- sbb 0x78(%rsi), %r8
- sbb \$0x0, %rax
- mov %r8, 0x78(%rdx)
+ mov 0x50(%rdi), %r8
+ mov 0x58(%rdi), %r9
+ mov 0x60(%rdi), %r10
+ mov 0x68(%rdi), %r11
+ sbb 0x50(%rsi), %r8
+ sbb 0x58(%rsi), %r9
+ sbb 0x60(%rsi), %r10
+ sbb 0x68(%rsi), %r11
+ sbb \$0x0, %rax
+ mov %r8, 0x50(%rdx)
+ mov %r9, 0x58(%rdx)
+ mov %r10, 0x60(%rdx)
+ mov %r11, 0x68(%rdx)
ret
.cfi_endproc
___
-# Double 2x503-bit multiprecision subtraction
+
+# Double 2x434-bit multiprecision subtraction
# Operation: c [rdx] = c [rdx] - a [rdi] - b [rsi]
$code.=<<___;
.globl ${PREFIX}_mpdblsubx2_asm
@@ -462,87 +685,81 @@
push %r13
.cfi_adjust_cfa_offset 8
.cfi_offset r13, -24
- push %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset r14, -32
- xor %rax, %rax
+ xor %rax, %rax
- mov 0x0(%rdx), %r8
- mov 0x8(%rdx), %r9
- mov 0x10(%rdx), %r10
- mov 0x18(%rdx), %r11
- mov 0x20(%rdx), %r12
- mov 0x28(%rdx), %r13
- mov 0x30(%rdx), %r14
- mov 0x38(%rdx), %rcx
- sub 0x0(%rdi), %r8
- sbb 0x8(%rdi), %r9
- sbb 0x10(%rdi), %r10
- sbb 0x18(%rdi), %r11
- sbb 0x20(%rdi), %r12
- sbb 0x28(%rdi), %r13
- sbb 0x30(%rdi), %r14
- sbb 0x38(%rdi), %rcx
- adc \$0x0, %rax
+ # ci:low = c:low - a:low
+ mov 0x0(%rdx), %r8
+ mov 0x8(%rdx), %r9
+ mov 0x10(%rdx), %r10
+ mov 0x18(%rdx), %r11
+ mov 0x20(%rdx), %r12
+ mov 0x28(%rdx), %r13
+ mov 0x30(%rdx), %rcx
+ sub 0x0(%rdi), %r8
+ sbb 0x8(%rdi), %r9
+ sbb 0x10(%rdi), %r10
+ sbb 0x18(%rdi), %r11
+ sbb 0x20(%rdi), %r12
+ sbb 0x28(%rdi), %r13
+ sbb 0x30(%rdi), %rcx
+ adc \$0x0, %rax
- sub 0x0(%rsi), %r8
- sbb 0x8(%rsi), %r9
- sbb 0x10(%rsi), %r10
- sbb 0x18(%rsi), %r11
- sbb 0x20(%rsi), %r12
- sbb 0x28(%rsi), %r13
- sbb 0x30(%rsi), %r14
- sbb 0x38(%rsi), %rcx
- adc \$0x0, %rax
+ # c:low = ci:low - b:low
+ sub 0x0(%rsi), %r8
+ sbb 0x8(%rsi), %r9
+ sbb 0x10(%rsi), %r10
+ sbb 0x18(%rsi), %r11
+ sbb 0x20(%rsi), %r12
+ sbb 0x28(%rsi), %r13
+ sbb 0x30(%rsi), %rcx
+ adc \$0x0, %rax
- mov %r8, 0x0(%rdx)
- mov %r9, 0x8(%rdx)
- mov %r10, 0x10(%rdx)
- mov %r11, 0x18(%rdx)
- mov %r12, 0x20(%rdx)
- mov %r13, 0x28(%rdx)
- mov %r14, 0x30(%rdx)
- mov %rcx, 0x38(%rdx)
+ # store c:low
+ mov %r8, 0x0(%rdx)
+ mov %r9, 0x8(%rdx)
+ mov %r10, 0x10(%rdx)
+ mov %r11, 0x18(%rdx)
+ mov %r12, 0x20(%rdx)
+ mov %r13, 0x28(%rdx)
+ mov %rcx, 0x30(%rdx)
- mov 0x40(%rdx), %r8
- mov 0x48(%rdx), %r9
- mov 0x50(%rdx), %r10
- mov 0x58(%rdx), %r11
- mov 0x60(%rdx), %r12
- mov 0x68(%rdx), %r13
- mov 0x70(%rdx), %r14
- mov 0x78(%rdx), %rcx
+ # ci:high = c:high - a:high
+ mov 0x38(%rdx), %r8
+ mov 0x40(%rdx), %r9
+ mov 0x48(%rdx), %r10
+ mov 0x50(%rdx), %r11
+ mov 0x58(%rdx), %r12
+ mov 0x60(%rdx), %r13
+ mov 0x68(%rdx), %rcx
- sub %rax, %r8
- sbb 0x40(%rdi), %r8
- sbb 0x48(%rdi), %r9
- sbb 0x50(%rdi), %r10
- sbb 0x58(%rdi), %r11
- sbb 0x60(%rdi), %r12
- sbb 0x68(%rdi), %r13
- sbb 0x70(%rdi), %r14
- sbb 0x78(%rdi), %rcx
- sub 0x40(%rsi), %r8
- sbb 0x48(%rsi), %r9
- sbb 0x50(%rsi), %r10
- sbb 0x58(%rsi), %r11
- sbb 0x60(%rsi), %r12
- sbb 0x68(%rsi), %r13
- sbb 0x70(%rsi), %r14
- sbb 0x78(%rsi), %rcx
+ sub %rax, %r8
+ sbb 0x38(%rdi), %r8
+ sbb 0x40(%rdi), %r9
+ sbb 0x48(%rdi), %r10
+ sbb 0x50(%rdi), %r11
+ sbb 0x58(%rdi), %r12
+ sbb 0x60(%rdi), %r13
+ sbb 0x68(%rdi), %rcx
- mov %r8, 0x40(%rdx)
- mov %r9, 0x48(%rdx)
- mov %r10, 0x50(%rdx)
- mov %r11, 0x58(%rdx)
- mov %r12, 0x60(%rdx)
- mov %r13, 0x68(%rdx)
- mov %r14, 0x70(%rdx)
- mov %rcx, 0x78(%rdx)
+ # c:high = ci:high - b:high
+ sub 0x38(%rsi), %r8
+ sbb 0x40(%rsi), %r9
+ sbb 0x48(%rsi), %r10
+ sbb 0x50(%rsi), %r11
+ sbb 0x58(%rsi), %r12
+ sbb 0x60(%rsi), %r13
+ sbb 0x68(%rsi), %rcx
- pop %r14
-.cfi_adjust_cfa_offset -8
+ # store c:high
+ mov %r8, 0x38(%rdx)
+ mov %r9, 0x40(%rdx)
+ mov %r10, 0x48(%rdx)
+ mov %r11, 0x50(%rdx)
+ mov %r12, 0x58(%rdx)
+ mov %r13, 0x60(%rdx)
+ mov %rcx, 0x68(%rdx)
+
pop %r13
.cfi_adjust_cfa_offset -8
pop %r12
@@ -552,117 +769,212 @@
___
-# Performs schoolbook multiplication of 2 256-bit numbers. Uses
-# MULX instruction. Result is stored in 256 bits pointed by $DST.
-sub mul256_school {
- my ($idxM0,$M0,$idxM1,$M1,$idxDST,$DST,$T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7,$T8,$T9)=@_;
- my ($ML0,$ML8,$ML16,$ML24)=map("$idxM0+$_(%$M0)",(0,8,16,24));
- my ($MR0,$MR8,$MR16,$MR24)=map("$idxM1+$_(%$M1)",(0,8,16,24));
- my ($D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7)=map("$idxDST+$_(%$DST)",(0,8,16,24,32,40,48,56));
+sub redc_common { # Builds the reduction body shared by both REDC variants; the four arguments are asm snippets spliced in before each accumulation stage.
+ my ($mul01, $mul23, $mul45, $mul67)=@_;
+ my $body=<<___;
+ $mul01
+ xor %rcx, %rcx
+ add 0x18(%rdi), %r8 # stage 1: add r8..r13 into a[3..8]; rcx becomes a[9] plus carry
+ adc 0x20(%rdi), %r9
+ adc 0x28(%rdi), %r10
+ adc 0x30(%rdi), %r11
+ adc 0x38(%rdi), %r12
+ adc 0x40(%rdi), %r13
+ adc 0x48(%rdi), %rcx
+ mov %r8, 0x18(%rdi) # write a[3..9] back in place (input buffer is overwritten)
+ mov %r9, 0x20(%rdi)
+ mov %r10, 0x28(%rdi)
+ mov %r11, 0x30(%rdi)
+ mov %r12, 0x38(%rdi)
+ mov %r13, 0x40(%rdi)
+ mov %rcx, 0x48(%rdi)
+ mov 0x50(%rdi), %r8 # reload a[10..13]; the movs above left CF untouched
+ mov 0x58(%rdi), %r9
+ mov 0x60(%rdi), %r10
+ mov 0x68(%rdi), %r11
+ adc \$0x0, %r8 # ripple the pending carry through a[10..13]
+ adc \$0x0, %r9
+ adc \$0x0, %r10
+ adc \$0x0, %r11
+ mov %r8, 0x50(%rdi)
+ mov %r9, 0x58(%rdi)
+ mov %r10, 0x60(%rdi)
+ mov %r11, 0x68(%rdi)
- $body=<<___;
- mov $ML0, %rdx
- mulx $MR0, %$T1, %$T0 # T0:T1 = A0*B0
- mov %$T1, $D0 # DST0_final
- mulx $MR8, %$T2, %$T1 # T1:T2 = A0*B1
- xor %rax, %rax
- adox %$T2, %$T0
- mulx $MR16,%$T3, %$T2 # T2:T3 = A0*B2
- adox %$T3, %$T1
- mulx $MR24,%$T4, %$T3 # T3:T4 = A0*B3
- adox %$T4, %$T2
+ $mul23
+ xor %rcx, %rcx
+ add 0x28(%rdi), %r8 # stage 2: add r8..r13 into a[5..10]; rcx becomes a[11] plus carry
+ adc 0x30(%rdi), %r9
+ adc 0x38(%rdi), %r10
+ adc 0x40(%rdi), %r11
+ adc 0x48(%rdi), %r12
+ adc 0x50(%rdi), %r13
+ adc 0x58(%rdi), %rcx
+ mov %r8, 0x28(%rdi) # write a[5..11] back in place
+ mov %r9, 0x30(%rdi)
+ mov %r10, 0x38(%rdi)
+ mov %r11, 0x40(%rdi)
+ mov %r12, 0x48(%rdi)
+ mov %r13, 0x50(%rdi)
+ mov %rcx, 0x58(%rdi)
+ mov 0x60(%rdi), %r8 # reload a[12..13]; CF still holds the stage-2 carry
+ mov 0x68(%rdi), %r9
+ adc \$0x0, %r8 # ripple the pending carry through a[12..13]
+ adc \$0x0, %r9
+ mov %r8, 0x60(%rdi)
+ mov %r9, 0x68(%rdi)
- mov $ML8, %rdx
- mulx $MR0, %$T4, %$T5 # T5:T4 = A1*B0
- adox %rax, %$T3
- xor %rax, %rax
- mulx $MR8, %$T7, %$T6 # T6:T7 = A1*B1
- adox %$T0, %$T4
- mov %$T4, $D1 # DST1_final
- adcx %$T7, %$T5
- mulx $MR16,%$T8, %$T7 # T7:T8 = A1*B2
- adcx %$T8, %$T6
- adox %$T1, %$T5
- mulx $MR24,%$T9, %$T8 # T8:T9 = A1*B3
- adcx %$T9, %$T7
- adcx %rax, %$T8
- adox %$T2, %$T6
+ $mul45
+ xor %rcx, %rcx
+ add 0x38(%rdi), %r8 # stage 3: add r8..r13 into a[7..12]; rcx becomes a[13] plus carry
+ adc 0x40(%rdi), %r9
+ adc 0x48(%rdi), %r10
+ adc 0x50(%rdi), %r11
+ adc 0x58(%rdi), %r12
+ adc 0x60(%rdi), %r13
+ adc 0x68(%rdi), %rcx
+ mov %r8, 0x0(%rsi) # C0
+ mov %r9, 0x8(%rsi) # C1
+ mov %r10, 0x48(%rdi) # remaining words go back to a[9..13] for the last stage
+ mov %r11, 0x50(%rdi)
+ mov %r12, 0x58(%rdi)
+ mov %r13, 0x60(%rdi)
+ mov %rcx, 0x68(%rdi)
- mov $ML16,%rdx
- mulx $MR0, %$T0, %$T1 # T1:T0 = A2*B0
- adox %$T3, %$T7
- adox %rax, %$T8
- xor %rax, %rax
- mulx $MR8, %$T3, %$T2 # T2:T3 = A2*B1
- adox %$T5, %$T0
- mov %$T0, $D2 # DST2_final
- adcx %$T3, %$T1
- mulx $MR16,%$T4, %$T3 # T3:T4 = A2*B2
- adcx %$T4, %$T2
- adox %$T6, %$T1
- mulx $MR24,%$T9, %$T4 # T3:T4 = A2*B3
- adcx %$T9, %$T3
-
- adcx %rax, %$T4
- adox %$T7, %$T2
- adox %$T8, %$T3
- adox %rax, %$T4
-
- mov $ML24, %rdx
- mulx $MR0, %$T0, %$T5 # T5:T0 = A3*B0
- xor %rax, %rax
- mulx $MR8, %$T7, %$T6 # T6:T7 = A3*B1
- adcx %$T7, %$T5
- adox %$T0, %$T1
- mulx $MR16, %$T8, %$T7 # T7:T8 = A3*B2
- adcx %$T8, %$T6
- adox %$T5, %$T2
- mulx $MR24, %$T9, %$T8 # T8:T9 = A3*B3
- adcx %$T9, %$T7
- adcx %rax, %$T8
- adox %$T6, %$T3
- adox %$T7, %$T4
- adox %rax, %$T8
- mov %$T1, $D3 # DST3_final
- mov %$T2, $D4 # DST4_final
- mov %$T3, $D5 # DST5_final
- mov %$T4, $D6 # DST6_final
- mov %$T8, $D7 # DST7_final
-
+ $mul67
+ add 0x48(%rdi), %r8 # stage 4: add a[9..13] into r8..r12 to form the last five output words
+ adc 0x50(%rdi), %r9
+ adc 0x58(%rdi), %r10
+ adc 0x60(%rdi), %r11
+ adc 0x68(%rdi), %r12
+ mov %r8, 0x10(%rsi) # C2
+ mov %r9, 0x18(%rsi) # C3
+ mov %r10, 0x20(%rsi) # C4
+ mov %r11, 0x28(%rsi) # C5
+ mov %r12, 0x30(%rsi) # C6
___
return $body;
}
-# 503-bit multiplication using Karatsuba (one level),
-# schoolbook (one level).
-sub mul_mulx {
- # [rcx+64] <- (AH+AL) x (BH+BL)
- my $mul256_low=&mul256_school(0,"rsp",32,"rsp",64,"rcx",map("r$_",(8..15)),"rbx","rbp");
+# Optimized Montgomery reduction for CPUs, based on method described
+# in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015.
+# Operation: c [rsi] = Montgomery reduction of a [rdi]
+# NOTE: a=c is not allowed; redc_common stores intermediate words back into a [rdi]
+sub sike_rdc {
+ my $jump_redc_bdw=$bmi2_adx ? &alt_impl(".Lrdc_bdw") : "";
+ # a[0-1] x .Lp434p1 --> result: r8:r13
+ my $mulx1=&mulx128x256( 0,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)),"%rcx");
+ # a[2-3] x .Lp434p1 --> result: r8:r13
+ my $mulx2=&mulx128x256(16,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)),"%rcx");
+ # a[4-5] x .Lp434p1 --> result: r8:r13
+ my $mulx3=&mulx128x256(32,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)),"%rcx");
+ # a[6] x .Lp434p1 (single word at byte offset 48); redc_common consumes r8:r12
+ my $mulx4=&mulx64x256( 48,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)));
+
+ # a[0-1] x .Lp434p1 --> result: r8:r13
+ my $mul1=&mul128x256( 0,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..14)),"%rcx");
+ # a[2-3] x .Lp434p1 --> result: r8:r13
+ my $mul2=&mul128x256(16,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..14)),"%rcx");
+ # a[4-5] x .Lp434p1 --> result: r8:r13
+ my $mul3=&mul128x256(32,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..14)),"%rcx");
+ # a[6] x .Lp434p1 (single word at byte offset 48); redc_common consumes r8:r12
+ my $mul4=&mul64x256( 48,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)));
+
+ my $redc_mul=&redc_common($mul1, $mul2, $mul3, $mul4);
+ my $redc_bdw=$bmi2_adx ? &redc_common($mulx1, $mulx2, $mulx3, $mulx4) : "";
+
+ # REDC for Broadwell CPUs
+ my $code=<<___;
+ .Lrdc_bdw:
+ .cfi_startproc
+ # sike_fprdc has already pushed r12--15 by this point.
+ .cfi_adjust_cfa_offset 32
+ .cfi_offset r12, -16
+ .cfi_offset r13, -24
+ .cfi_offset r14, -32
+ .cfi_offset r15, -40
+
+ $redc_bdw
+
+ pop %r15
+ .cfi_adjust_cfa_offset -8
+ .cfi_same_value r15
+ pop %r14
+ .cfi_adjust_cfa_offset -8
+ .cfi_same_value r14
+ pop %r13
+ .cfi_adjust_cfa_offset -8
+ .cfi_same_value r13
+ pop %r12
+ .cfi_adjust_cfa_offset -8
+ .cfi_same_value r12
+ ret
+ .cfi_endproc
+___
+
+ # REDC for CPUs older than Broadwell
+ $code.=<<___;
+ .globl ${PREFIX}_fprdc
+ .type ${PREFIX}_fprdc,\@function,3
+ ${PREFIX}_fprdc:
+ .cfi_startproc
+ push %r12
+ .cfi_adjust_cfa_offset 8
+ .cfi_offset r12, -16
+ push %r13
+ .cfi_adjust_cfa_offset 8
+ .cfi_offset r13, -24
+ push %r14
+ .cfi_adjust_cfa_offset 8
+ .cfi_offset r14, -32
+ push %r15
+ .cfi_adjust_cfa_offset 8
+ .cfi_offset r15, -40
+
+ # Jump to optimized implementation if
+ # CPU supports ADCX/ADOX/MULX
+ $jump_redc_bdw
+ # Otherwise use generic implementation
+ $redc_mul
+
+ pop %r15
+ .cfi_adjust_cfa_offset -8
+ pop %r14
+ .cfi_adjust_cfa_offset -8
+ pop %r13
+ .cfi_adjust_cfa_offset -8
+ pop %r12
+ .cfi_adjust_cfa_offset -8
+ ret
+ .cfi_endproc
+___
+ return $code;
+}
+$code.=&sike_rdc();
+
+# 434-bit multiplication using Karatsuba (one level),
+# schoolbook (one level). Uses MULX/ADOX/ADCX instructions
+# available on Broadwell micro-architectures and newer.
+sub mul_bdw {
+ # [rsp] <- (AH+AL) x (BH+BL)
+ my $mul256_low=&mul256(0,"%rsp",32,"%rsp",0,"%rsp",map("%r$_",(8..15)),"%rbx","%rbp");
# [rcx] <- AL x BL
- my $mul256_albl=&mul256_school(0,"rdi",0,"rsi",0,"rcx",map("r$_",(8..15)),"rbx","rbp");
- # [rsp] <- AH x BH
- my $mul256_ahbh=&mul256_school(32,"rdi",32,"rsi",0,"rsp",map("r$_",(8..15)),"rbx","rbp");
+ my $mul256_albl=&mul256(0,"%rdi",0,"%rsi",0,"%rcx",map("%r$_",(8..15)),"%rbx","%rbp");
+ # [rcx+64] <- AH x BH
+ my $mul192_ahbh=&mul192(32,"%rdi",32,"%rsi",64,"%rcx",map("%r$_",(8..14)));
$body=<<___;
- .Lmul_mulx:
- .cfi_startproc
- # sike_mpmul has already pushed r12--15 by this point.
- .cfi_adjust_cfa_offset 32
- .cfi_offset r12, -16
- .cfi_offset r13, -24
- .cfi_offset r14, -32
- .cfi_offset r15, -40
- mov %rdx, %rcx
+ mov %rdx, %rcx
+ xor %rax, %rax
# r8-r11 <- AH + AL, rax <- mask
- xor %rax, %rax
- mov (%rdi), %r8
- mov 0x8(%rdi), %r9
- mov 0x10(%rdi), %r10
- mov 0x18(%rdi), %r11
- push %rbx
+ mov 0x0(%rdi), %r8
+ mov 0x8(%rdi), %r9
+ mov 0x10(%rdi), %r10
+ mov 0x18(%rdi), %r11
+ push %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset rbx, -48
push %rbp
@@ -670,131 +982,123 @@
.cfi_adjust_cfa_offset 8
sub \$96, %rsp
.cfi_adjust_cfa_offset 96
- add 0x20(%rdi), %r8
- adc 0x28(%rdi), %r9
- adc 0x30(%rdi), %r10
- adc 0x38(%rdi), %r11
- sbb \$0x0, %rax
- mov %r8, (%rsp)
- mov %r9, 0x8(%rsp)
- mov %r10, 0x10(%rsp)
- mov %r11, 0x18(%rsp)
+
+ add 0x20(%rdi), %r8
+ adc 0x28(%rdi), %r9
+ adc 0x30(%rdi), %r10
+ adc \$0x0, %r11
+ sbb \$0x0, %rax
+ mov %r8, 0x0(%rsp)
+ mov %r9, 0x8(%rsp)
+ mov %r10, 0x10(%rsp)
+ mov %r11, 0x18(%rsp)
# r12-r15 <- BH + BL, rbx <- mask
- xor %rbx, %rbx
- mov (%rsi), %r12
- mov 0x8(%rsi), %r13
- mov 0x10(%rsi), %r14
- mov 0x18(%rsi), %r15
- add 0x20(%rsi), %r12
- adc 0x28(%rsi), %r13
- adc 0x30(%rsi), %r14
- adc 0x38(%rsi), %r15
- sbb \$0x0, %rbx
- mov %r12, 0x20(%rsp)
- mov %r13, 0x28(%rsp)
- mov %r14, 0x30(%rsp)
- mov %r15, 0x38(%rsp)
+ xor %rbx, %rbx
+ mov 0x0(%rsi), %r12
+ mov 0x8(%rsi), %r13
+ mov 0x10(%rsi), %r14
+ mov 0x18(%rsi), %r15
+ add 0x20(%rsi), %r12
+ adc 0x28(%rsi), %r13
+ adc 0x30(%rsi), %r14
+ adc \$0x0, %r15
+ sbb \$0x0, %rbx
+ mov %r12, 0x20(%rsp)
+ mov %r13, 0x28(%rsp)
+ mov %r14, 0x30(%rsp)
+ mov %r15, 0x38(%rsp)
# r12-r15 <- masked (BH + BL)
- and %rax, %r12
- and %rax, %r13
- and %rax, %r14
- and %rax, %r15
+ and %rax, %r12
+ and %rax, %r13
+ and %rax, %r14
+ and %rax, %r15
# r8-r11 <- masked (AH + AL)
- and %rbx, %r8
- and %rbx, %r9
- and %rbx, %r10
- and %rbx, %r11
+ and %rbx, %r8
+ and %rbx, %r9
+ and %rbx, %r10
+ and %rbx, %r11
- # r8-r11 <- masked (AH + AL) + masked (AH + AL)
- add %r12, %r8
- adc %r13, %r9
- adc %r14, %r10
- adc %r15, %r11
- mov %r8, 0x40(%rsp)
- mov %r9, 0x48(%rsp)
- mov %r10, 0x50(%rsp)
- mov %r11, 0x58(%rsp)
+ # r8-r11 <- masked (AH + AL) + masked (BH + BL)
+ add %r12, %r8
+ adc %r13, %r9
+ adc %r14, %r10
+ adc %r15, %r11
+ mov %r8, 0x40(%rsp)
+ mov %r9, 0x48(%rsp)
+ mov %r10, 0x50(%rsp)
+ mov %r11, 0x58(%rsp)
- # [rcx+64] <- (AH+AL) x (BH+BL)
+ # [rsp] <- CM = (AH+AL) x (BH+BL)
$mul256_low
- # [rcx] <- AL x BL (Result c0-c3)
+ # [rcx] <- CL = AL x BL (Result c0-c3)
$mul256_albl
- # [rsp] <- AH x BH
- $mul256_ahbh
+ # [rcx+64] <- CH = AH x BH
+ $mul192_ahbh
# r8-r11 <- (AH+AL) x (BH+BL), final step
- mov 0x40(%rsp), %r8
- mov 0x48(%rsp), %r9
- mov 0x50(%rsp), %r10
- mov 0x58(%rsp), %r11
- mov 0x60(%rcx), %rax
- add %rax, %r8
- mov 0x68(%rcx), %rax
- adc %rax, %r9
- mov 0x70(%rcx), %rax
- adc %rax, %r10
- mov 0x78(%rcx), %rax
- adc %rax, %r11
+ mov 0x40(%rsp), %r8
+ mov 0x48(%rsp), %r9
+ mov 0x50(%rsp), %r10
+ mov 0x58(%rsp), %r11
- # [rcx+64], x3-x5 <- (AH+AL) x (BH+BL) - ALxBL
- mov 0x40(%rcx), %r12
- mov 0x48(%rcx), %r13
- mov 0x50(%rcx), %r14
- mov 0x58(%rcx), %r15
- sub (%rcx), %r12
- sbb 0x8(%rcx), %r13
- sbb 0x10(%rcx), %r14
- sbb 0x18(%rcx), %r15
- sbb 0x20(%rcx), %r8
- sbb 0x28(%rcx), %r9
- sbb 0x30(%rcx), %r10
- sbb 0x38(%rcx), %r11
+ mov 0x20(%rsp), %rax
+ add %rax, %r8
+ mov 0x28(%rsp), %rax
+ adc %rax, %r9
+ mov 0x30(%rsp), %rax
+ adc %rax, %r10
+ mov 0x38(%rsp), %rax
+ adc %rax, %r11
+
+ # [rsp], x3-x5 <- (AH+AL) x (BH+BL) - ALxBL
+ mov 0x0(%rsp), %r12
+ mov 0x8(%rsp), %r13
+ mov 0x10(%rsp), %r14
+ mov 0x18(%rsp), %r15
+ sub 0x0(%rcx), %r12
+ sbb 0x8(%rcx), %r13
+ sbb 0x10(%rcx), %r14
+ sbb 0x18(%rcx), %r15
+ sbb 0x20(%rcx), %r8
+ sbb 0x28(%rcx), %r9
+ sbb 0x30(%rcx), %r10
+ sbb 0x38(%rcx), %r11
# r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
- sub (%rsp), %r12
- sbb 0x8(%rsp), %r13
- sbb 0x10(%rsp), %r14
- sbb 0x18(%rsp), %r15
- sbb 0x20(%rsp), %r8
- sbb 0x28(%rsp), %r9
- sbb 0x30(%rsp), %r10
- sbb 0x38(%rsp), %r11
+ sub 0x40(%rcx), %r12
+ sbb 0x48(%rcx), %r13
+ sbb 0x50(%rcx), %r14
+ sbb 0x58(%rcx), %r15
+ sbb 0x60(%rcx), %r8
+ sbb 0x68(%rcx), %r9
+ sbb \$0x0, %r10
+ sbb \$0x0, %r11
- add 0x20(%rcx), %r12
- mov %r12, 0x20(%rcx) # Result C4-C7
- adc 0x28(%rcx), %r13
- mov %r13, 0x28(%rcx)
- adc 0x30(%rcx), %r14
- mov %r14, 0x30(%rcx)
- adc 0x38(%rcx), %r15
- mov %r15, 0x38(%rcx)
- mov (%rsp), %rax
- adc %rax, %r8 # Result C8-C15
- mov %r8, 0x40(%rcx)
- mov 0x8(%rsp), %rax
- adc %rax, %r9
- mov %r9, 0x48(%rcx)
- mov 0x10(%rsp), %rax
- adc %rax, %r10
- mov %r10, 0x50(%rcx)
- mov 0x18(%rsp), %rax
- adc %rax, %r11
- mov %r11, 0x58(%rcx)
- mov 0x20(%rsp), %r12
- adc \$0x0, %r12
- mov %r12, 0x60(%rcx)
- mov 0x28(%rsp), %r13
- adc \$0x0, %r13
- mov %r13, 0x68(%rcx)
- mov 0x30(%rsp), %r14
- adc \$0x0, %r14
- mov %r14, 0x70(%rcx)
- mov 0x38(%rsp), %r15
- adc \$0x0, %r15
- mov %r15, 0x78(%rcx)
+ add 0x20(%rcx), %r12
+ mov %r12, 0x20(%rcx) # Result C4-C7
+ adc 0x28(%rcx), %r13
+ mov %r13, 0x28(%rcx)
+ adc 0x30(%rcx), %r14
+ mov %r14, 0x30(%rcx)
+ adc 0x38(%rcx), %r15
+ mov %r15, 0x38(%rcx)
+ adc 0x40(%rcx), %r8
+ mov %r8, 0x40(%rcx) # Result C8-C15
+ adc 0x48(%rcx), %r9
+ mov %r9, 0x48(%rcx)
+ adc 0x50(%rcx), %r10
+ mov %r10, 0x50(%rcx)
+ adc 0x58(%rcx), %r11
+ mov %r11, 0x58(%rcx)
+ mov 0x60(%rcx), %r12
+ adc \$0x0, %r12
+ mov %r12, 0x60(%rcx)
+ mov 0x68(%rcx), %r13
+ adc \$0x0, %r13
+ mov %r13, 0x68(%rcx)
add \$96, %rsp
.cfi_adjust_cfa_offset -96
@@ -804,6 +1108,461 @@
pop %rbx
.cfi_adjust_cfa_offset -8
.cfi_same_value rbx
+___
+ return $body;
+}
+
+# 434-bit multiplication using Karatsuba (one level),
+# schoolbook (one level). Generic body built on plain MUL; returns the asm text.
+sub mul {
+ my $code=<<___;
+ mov %rdx, %rcx
+
+ sub \$112, %rsp # Allocating space in stack
+ .cfi_adjust_cfa_offset 112
+
+ # rcx[0-3] <- AH+AL
+ xor %rax, %rax
+ mov 0x20(%rdi), %r8
+ mov 0x28(%rdi), %r9
+ mov 0x30(%rdi), %r10
+ xor %r11, %r11
+ add 0x0(%rdi), %r8
+ adc 0x8(%rdi), %r9
+ adc 0x10(%rdi), %r10
+ adc 0x18(%rdi), %r11
+ # store AH+AL mask
+ sbb \$0, %rax
+ mov %rax, 0x40(%rsp)
+ # store AH+AL in 0-0x18(rcx)
+ mov %r8, 0x0(%rcx)
+ mov %r9, 0x8(%rcx)
+ mov %r10, 0x10(%rcx)
+ mov %r11, 0x18(%rcx)
+
+ # r12-r15 <- BH+BL
+ xor %rdx, %rdx
+ mov 0x20(%rsi), %r12
+ mov 0x28(%rsi), %r13
+ mov 0x30(%rsi), %r14
+ xor %r15, %r15
+ add 0x0(%rsi), %r12
+ adc 0x8(%rsi), %r13
+ adc 0x10(%rsi), %r14
+ adc 0x18(%rsi), %r15
+ sbb \$0x0, %rdx
+ # store BH+BL mask
+ mov %rdx, 0x48(%rsp)
+
+ # (rsp[0-0x38]) <- (AH+AL)*(BH+BL)
+ mov (%rcx), %rax
+ mul %r12
+ mov %rax, (%rsp) # c0
+ mov %rdx, %r8
+
+ xor %r9, %r9
+ mov (%rcx), %rax
+ mul %r13
+ add %rax, %r8
+ adc %rdx, %r9
+
+ xor %r10, %r10
+ mov 0x8(%rcx), %rax
+ mul %r12
+ add %rax, %r8
+ mov %r8, 0x8(%rsp) # c1
+ adc %rdx, %r9
+ adc \$0x0,%r10
+
+ xor %r8, %r8
+ mov (%rcx), %rax
+ mul %r14
+ add %rax, %r9
+ adc %rdx, %r10
+ adc \$0x0,%r8
+
+ mov 0x10(%rcx), %rax
+ mul %r12
+ add %rax, %r9
+ adc %rdx, %r10
+ adc \$0x0,%r8
+
+ mov 0x8(%rcx), %rax
+ mul %r13
+ add %rax, %r9
+ mov %r9, 0x10(%rsp) # c2
+ adc %rdx, %r10
+ adc \$0x0, %r8
+
+ xor %r9, %r9
+ mov (%rcx),%rax
+ mul %r15
+ add %rax, %r10
+ adc %rdx, %r8
+ adc \$0x0,%r9
+
+ mov 0x18(%rcx), %rax
+ mul %r12
+ add %rax, %r10
+ adc %rdx, %r8
+ adc \$0x0,%r9
+
+ mov 0x8(%rcx), %rax
+ mul %r14
+ add %rax, %r10
+ adc %rdx, %r8
+ adc \$0x0,%r9
+
+ mov 0x10(%rcx), %rax
+ mul %r13
+ add %rax, %r10
+ mov %r10, 0x18(%rsp) # c3
+ adc %rdx, %r8
+ adc \$0x0, %r9
+
+ xor %r10, %r10
+ mov 0x8(%rcx), %rax
+ mul %r15
+ add %rax, %r8
+ adc %rdx, %r9
+ adc \$0x0,%r10
+
+ mov 0x18(%rcx), %rax
+ mul %r13
+ add %rax, %r8
+ adc %rdx, %r9
+ adc \$0x0,%r10
+
+ mov 0x10(%rcx), %rax
+ mul %r14
+ add %rax, %r8 # c4
+ mov %r8, 0x20(%rsp)
+ adc %rdx, %r9
+ adc \$0x0,%r10
+
+ xor %r11, %r11
+ mov 0x10(%rcx), %rax
+ mul %r15
+ add %rax, %r9
+ adc %rdx, %r10
+ adc \$0x0,%r11
+
+ mov 0x18(%rcx), %rax
+ mul %r14
+ add %rax, %r9 # c5
+ mov %r9, 0x28(%rsp)
+ adc %rdx, %r10
+ adc \$0x0,%r11
+
+ mov 0x18(%rcx), %rax
+ mul %r15
+ add %rax, %r10 # c6
+ mov %r10, 0x30(%rsp)
+ adc %rdx, %r11 # c7
+ mov %r11, 0x38(%rsp)
+
+ # r12-r15 <- masked (BH + BL)
+ mov 0x40(%rsp), %rax
+ and %rax, %r12
+ and %rax, %r13
+ and %rax, %r14
+ and %rax, %r15
+
+ # r8-r11 <- masked (AH + AL)
+ mov 0x48(%rsp),%rax
+ mov 0x00(%rcx), %r8
+ and %rax, %r8
+ mov 0x08(%rcx), %r9
+ and %rax, %r9
+ mov 0x10(%rcx), %r10
+ and %rax, %r10
+ mov 0x18(%rcx), %r11
+ and %rax, %r11
+
+ # r12-r15 <- masked (AH + AL) + masked (BH + BL)
+ add %r8, %r12
+ adc %r9, %r13
+ adc %r10, %r14
+ adc %r11, %r15
+
+ # rsp[0x50-0x68] <- masked sums + (AH+AL) x (BH+BL) high (rsp[0x20-0x38])
+ mov 0x20(%rsp), %rax
+ add %rax, %r12
+ mov 0x28(%rsp), %rax
+ adc %rax, %r13
+ mov 0x30(%rsp), %rax
+ adc %rax, %r14
+ mov 0x38(%rsp), %rax
+ adc %rax, %r15
+ mov %r12, 0x50(%rsp)
+ mov %r13, 0x58(%rsp)
+ mov %r14, 0x60(%rsp)
+ mov %r15, 0x68(%rsp)
+
+ # [rcx] <- CL = AL x BL
+ mov (%rdi), %r11
+ mov (%rsi), %rax
+ mul %r11
+ xor %r9, %r9
+ mov %rax, (%rcx) # c0
+ mov %rdx, %r8
+
+ mov 0x10(%rdi), %r14
+ mov 0x8(%rsi), %rax
+ mul %r11
+ xor %r10, %r10
+ add %rax, %r8
+ adc %rdx, %r9
+
+ mov 0x8(%rdi), %r12
+ mov (%rsi), %rax
+ mul %r12
+ add %rax, %r8
+ mov %r8, 0x8(%rcx) # c1
+ adc %rdx, %r9
+ adc \$0x0,%r10
+
+ xor %r8, %r8
+ mov 0x10(%rsi), %rax
+ mul %r11
+ add %rax, %r9
+ adc %rdx, %r10
+ adc \$0x0,%r8
+
+ mov (%rsi),%r13
+ mov %r14, %rax
+ mul %r13
+ add %rax, %r9
+ adc %rdx, %r10
+ adc \$0x0,%r8
+
+ mov 0x8(%rsi), %rax
+ mul %r12
+ add %rax, %r9
+ mov %r9, 0x10(%rcx) # c2
+ adc %rdx, %r10
+ adc \$0x0,%r8
+
+ xor %r9, %r9
+ mov 0x18(%rsi), %rax
+ mul %r11
+ mov 0x18(%rdi), %r15
+ add %rax, %r10
+ adc %rdx, %r8
+ adc \$0x0,%r9
+
+ mov %r15, %rax
+ mul %r13
+ add %rax, %r10
+ adc %rdx, %r8
+ adc \$0x0,%r9
+
+ mov 0x10(%rsi), %rax
+ mul %r12
+ add %rax, %r10
+ adc %rdx, %r8
+ adc \$0x0,%r9
+
+ mov 0x8(%rsi), %rax
+ mul %r14
+ add %rax, %r10
+ mov %r10, 0x18(%rcx) # c3
+ adc %rdx, %r8
+ adc \$0x0,%r9
+
+ xor %r10, %r10
+ mov 0x18(%rsi), %rax
+ mul %r12
+ add %rax, %r8
+ adc %rdx, %r9
+ adc \$0x0,%r10
+
+ mov 0x8(%rsi), %rax
+ mul %r15
+ add %rax, %r8
+ adc %rdx, %r9
+ adc \$0x0,%r10
+
+ mov 0x10(%rsi), %rax
+ mul %r14
+ add %rax, %r8
+ mov %r8, 0x20(%rcx) # c4
+ adc %rdx, %r9
+ adc \$0x0,%r10
+
+ xor %r8, %r8
+ mov 0x18(%rsi), %rax
+ mul %r14
+ add %rax, %r9
+ adc %rdx, %r10
+ adc \$0x0,%r8
+
+ mov 0x10(%rsi), %rax
+ mul %r15
+ add %rax, %r9
+ mov %r9, 0x28(%rcx) # c5
+ adc %rdx, %r10
+ adc \$0x0,%r8
+
+ mov 0x18(%rsi), %rax
+ mul %r15
+ add %rax, %r10
+ mov %r10, 0x30(%rcx) # c6
+ adc %rdx, %r8
+ mov %r8, 0x38(%rcx) # c7
+
+ # rcx[0x40-0x68] <- AH*BH
+ # multiplies 2 192-bit numbers A,B
+ mov 0x20(%rdi), %r11
+ mov 0x20(%rsi), %rax
+ mul %r11
+ xor %r9, %r9
+ mov %rax, 0x40(%rcx) # c0
+ mov %rdx, %r8
+
+ mov 0x30(%rdi), %r14
+ mov 0x28(%rsi), %rax
+ mul %r11
+ xor %r10, %r10
+ add %rax, %r8
+ adc %rdx, %r9
+
+ mov 0x28(%rdi), %r12
+ mov 0x20(%rsi), %rax
+ mul %r12
+ add %rax, %r8
+ mov %r8, 0x48(%rcx) # c1
+ adc %rdx, %r9
+ adc \$0x0,%r10
+
+ xor %r8, %r8
+ mov 0x30(%rsi), %rax
+ mul %r11
+ add %rax, %r9
+ adc %rdx, %r10
+ adc \$0x0,%r8
+
+ mov 0x20(%rsi), %r13
+ mov %r14, %rax
+ mul %r13
+ add %rax, %r9
+ adc %rdx, %r10
+ adc \$0x0,%r8
+
+ mov 0x28(%rsi), %rax
+ mul %r12
+ add %rax, %r9
+ mov %r9, 0x50(%rcx) # c2
+ adc %rdx, %r10
+ adc \$0x0,%r8
+
+ mov 0x30(%rsi), %rax
+ mul %r12
+ xor %r12, %r12
+ add %rax, %r10
+ adc %rdx, %r8
+ adc \$0x0,%r12
+
+ mov 0x28(%rsi), %rax
+ mul %r14
+ add %rax, %r10
+ adc %rdx, %r8
+ adc \$0x0,%r12
+ mov %r10, 0x58(%rcx) # c3
+
+ mov 0x30(%rsi), %rax
+ mul %r14
+ add %rax, %r8
+ adc \$0x0,%r12
+ mov %r8, 0x60(%rcx) # c4
+
+ add %r12, %rdx # c5
+
+ # [r8-r15] <- (AH+AL)x(BH+BL) - ALxBL
+ mov 0x0(%rsp), %r8
+ sub 0x0(%rcx), %r8
+ mov 0x8(%rsp), %r9
+ sbb 0x8(%rcx), %r9
+ mov 0x10(%rsp), %r10
+ sbb 0x10(%rcx), %r10
+ mov 0x18(%rsp), %r11
+ sbb 0x18(%rcx), %r11
+ mov 0x50(%rsp), %r12
+ sbb 0x20(%rcx), %r12
+ mov 0x58(%rsp), %r13
+ sbb 0x28(%rcx), %r13
+ mov 0x60(%rsp), %r14
+ sbb 0x30(%rcx), %r14
+ mov 0x68(%rsp), %r15
+ sbb 0x38(%rcx), %r15
+
+ # [r8-r15] <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
+ mov 0x40(%rcx), %rax
+ sub %rax, %r8
+ mov 0x48(%rcx), %rax
+ sbb %rax, %r9
+ mov 0x50(%rcx), %rax
+ sbb %rax, %r10
+ mov 0x58(%rcx), %rax
+ sbb %rax, %r11
+ mov 0x60(%rcx), %rax
+ sbb %rax, %r12
+ sbb %rdx, %r13
+ sbb \$0x0,%r14
+ sbb \$0x0,%r15
+
+ # Final result
+ add 0x20(%rcx), %r8
+ mov %r8, 0x20(%rcx) # Result C4-C7
+ adc 0x28(%rcx), %r9
+ mov %r9, 0x28(%rcx)
+ adc 0x30(%rcx), %r10
+ mov %r10, 0x30(%rcx)
+ adc 0x38(%rcx), %r11
+ mov %r11, 0x38(%rcx)
+ adc 0x40(%rcx), %r12
+ mov %r12, 0x40(%rcx) # Result C8-C13
+ adc 0x48(%rcx), %r13
+ mov %r13, 0x48(%rcx)
+ adc 0x50(%rcx), %r14
+ mov %r14, 0x50(%rcx)
+ adc 0x58(%rcx), %r15
+ mov %r15, 0x58(%rcx)
+ mov 0x60(%rcx), %r12
+ adc \$0x0, %r12
+ mov %r12, 0x60(%rcx)
+ adc \$0x0, %rdx
+ mov %rdx, 0x68(%rcx)
+
+ add \$112, %rsp # Restoring space in stack
+ .cfi_adjust_cfa_offset -112
+___
+
+ return $code;
+}
+
+# Integer multiplication based on Karatsuba method
+# Operation: c [rdx] = a [rdi] * b [rsi]
+# NOTE: a=c or b=c are not allowed
+sub sike_mul {
+ my $jump_mul_bdw=&alt_impl(".Lmul_bdw") if ($bmi2_adx);
+ # MUL for Broadwell CPUs
+ my $mul_bdw=&mul_bdw() if ($bmi2_adx);
+ # MUL for CPUs older than Broadwell
+ my $mul=&mul();
+
+ my $body=<<___;
+ .Lmul_bdw:
+ .cfi_startproc
+ # sike_mpmul has already pushed r12--15 by this point.
+ .cfi_adjust_cfa_offset 32
+ .cfi_offset r12, -16
+ .cfi_offset r13, -24
+ .cfi_offset r14, -32
+ .cfi_offset r15, -40
+
+ $mul_bdw
+
pop %r15
.cfi_adjust_cfa_offset -8
.cfi_same_value r15
@@ -819,34 +1578,6 @@
ret
.cfi_endproc
-___
- return $body;
-}
-
-# Jump to alternative implemenatation provided as an
-# argument in case CPU supports ADOX/ADCX and MULX instructions.
-sub alt_impl {
- $jmp_func = shift;
-
- $body=<<___;
- lea OPENSSL_ia32cap_P(%rip), %rcx
- mov 8(%rcx), %rcx
- and \$0x80100, %ecx
- cmp \$0x80100, %ecx
- je $jmp_func
-
-___
- return $body
-}
-
-# Integer multiplication based on Karatsuba method
-# Operation: c [rdx] = a [rdi] * b [rsi]
-# NOTE: a=c or b=c are not allowed
-sub mul {
- my $jump_optim.=&alt_impl(".Lmul_mulx") if ($bmi2_adx);
- my $body.=&mul_mulx() if ($bmi2_adx);
-
- $body.=<<___;
.globl ${PREFIX}_mpmul
.type ${PREFIX}_mpmul,\@function,3
${PREFIX}_mpmul:
@@ -864,461 +1595,12 @@
.cfi_adjust_cfa_offset 8
.cfi_offset r15, -40
- $jump_optim
+ # Jump to optimized implementation if
+ # CPU supports ADCX/ADOX/MULX
+ $jump_mul_bdw
+ # Otherwise use generic implementation
+ $mul
- mov %rdx, %rcx
-
- # rcx[0-3] <- AH+AL
- xor %rax, %rax
- mov 0x20(%rdi), %r8
- mov 0x28(%rdi), %r9
- mov 0x30(%rdi), %r10
- mov 0x38(%rdi), %r11
- add 0x0(%rdi), %r8
- adc 0x8(%rdi), %r9
- adc 0x10(%rdi), %r10
- adc 0x18(%rdi), %r11
- mov %r8, 0x0(%rcx)
- mov %r9, 0x8(%rcx)
- mov %r10, 0x10(%rcx)
- mov %r11, 0x18(%rcx)
- sbb \$0, %rax
- sub \$80, %rsp # Allocating space in stack
- .cfi_adjust_cfa_offset 80
-
- # r12-r15 <- BH+BL
- xor %rdx, %rdx
- mov 0x20(%rsi), %r12
- mov 0x28(%rsi), %r13
- mov 0x30(%rsi), %r14
- mov 0x38(%rsi), %r15
- add 0x0(%rsi), %r12
- adc 0x8(%rsi), %r13
- adc 0x10(%rsi), %r14
- adc 0x18(%rsi), %r15
- sbb \$0x0, %rdx
- mov %rax, 0x40(%rsp)
- mov %rdx, 0x48(%rsp)
-
- # (rsp[0-3],r8,r9,r10,r11) <- (AH+AL)*(BH+BL)
- mov (%rcx), %rax
- mul %r12
- mov %rax, (%rsp) # c0
- mov %rdx, %r8
-
- xor %r9, %r9
- mov (%rcx), %rax
- mul %r13
- add %rax, %r8
- adc %rdx, %r9
-
- xor %r10, %r10
- mov 0x8(%rcx), %rax
- mul %r12
- add %rax, %r8
- mov %r8, 0x8(%rsp) # c1
- adc %rdx, %r9
- adc \$0x0, %r10
-
- xor %r8, %r8
- mov (%rcx), %rax
- mul %r14
- add %rax, %r9
- adc %rdx, %r10
- adc \$0x0, %r8
-
- mov 0x10(%rcx), %rax
- mul %r12
- add %rax, %r9
- adc %rdx, %r10
- adc \$0x0, %r8
-
- mov 0x8(%rcx), %rax
- mul %r13
- add %rax, %r9
- mov %r9, 0x10(%rsp) # c2
- adc %rdx, %r10
- adc \$0x0, %r8
-
- xor %r9, %r9
- mov (%rcx), %rax
- mul %r15
- add %rax, %r10
- adc %rdx, %r8
- adc \$0x0, %r9
-
- mov 0x18(%rcx), %rax
- mul %r12
- add %rax, %r10
- adc %rdx, %r8
- adc \$0x0, %r9
-
- mov 0x8(%rcx), %rax
- mul %r14
- add %rax, %r10
- adc %rdx, %r8
- adc \$0x0, %r9
-
- mov 0x10(%rcx), %rax
- mul %r13
- add %rax, %r10
- mov %r10, 0x18(%rsp) # c3
- adc %rdx, %r8
- adc \$0x0, %r9
-
- xor %r10, %r10
- mov 0x8(%rcx), %rax
- mul %r15
- add %rax, %r8
- adc %rdx, %r9
- adc \$0x0, %r10
-
- mov 0x18(%rcx), %rax
- mul %r13
- add %rax, %r8
- adc %rdx, %r9
- adc \$0x0, %r10
-
- mov 0x10(%rcx), %rax
- mul %r14
- add %rax, %r8
- mov %r8, 0x20(%rsp) # c4
- adc %rdx, %r9
- adc \$0x0, %r10
-
- xor %r11, %r11
- mov 0x10(%rcx), %rax
- mul %r15
- add %rax, %r9
- adc %rdx, %r10
- adc \$0x0, %r11
-
- mov 0x18(%rcx), %rax
- mul %r14
- add %rax, %r9 # c5
- adc %rdx, %r10
- adc \$0x0, %r11
-
- mov 0x18(%rcx), %rax
- mul %r15
- add %rax, %r10 # c6
- adc %rdx, %r11 # c7
-
- mov 0x40(%rsp), %rax
- and %rax, %r12
- and %rax, %r13
- and %rax, %r14
- and %rax, %r15
- add %r8, %r12
- adc %r9, %r13
- adc %r10, %r14
- adc %r11, %r15
-
- mov 0x48(%rsp), %rax
- mov (%rcx), %r8
- mov 0x8(%rcx), %r9
- mov 0x10(%rcx), %r10
- mov 0x18(%rcx), %r11
- and %rax, %r8
- and %rax, %r9
- and %rax, %r10
- and %rax, %r11
- add %r12, %r8
- adc %r13, %r9
- adc %r14, %r10
- adc %r15, %r11
- mov %r8, 0x20(%rsp)
- mov %r9, 0x28(%rsp)
- mov %r10, 0x30(%rsp)
- mov %r11, 0x38(%rsp)
-
- mov (%rdi), %r11
- mov (%rsi), %rax
- mul %r11
- xor %r9, %r9
- mov %rax, (%rcx) # c0
- mov %rdx, %r8
-
- mov 0x10(%rdi), %r14
- mov 0x8(%rsi), %rax
- mul %r11
- xor %r10, %r10
- add %rax, %r8
- adc %rdx, %r9
-
- mov 0x8(%rdi), %r12
- mov (%rsi), %rax
- mul %r12
- add %rax, %r8
- mov %r8, 0x8(%rcx) # c1
- adc %rdx, %r9
- adc \$0x0, %r10
-
- xor %r8, %r8
- mov 0x10(%rsi), %rax
- mul %r11
- add %rax, %r9
- adc %rdx, %r10
- adc \$0x0, %r8
-
- mov (%rsi), %r13
- mov %r14, %rax
- mul %r13
- add %rax, %r9
- adc %rdx, %r10
- adc \$0x0, %r8
-
- mov 0x8(%rsi), %rax
- mul %r12
- add %rax, %r9
- mov %r9, 0x10(%rcx) # c2
- adc %rdx, %r10
- adc \$0x0, %r8
-
- xor %r9, %r9
- mov 0x18(%rsi), %rax
- mul %r11
- mov 0x18(%rdi), %r15
- add %rax, %r10
- adc %rdx, %r8
- adc \$0x0, %r9
-
- mov %r15, %rax
- mul %r13
- add %rax, %r10
- adc %rdx, %r8
- adc \$0x0, %r9
-
- mov 0x10(%rsi), %rax
- mul %r12
- add %rax, %r10
- adc %rdx, %r8
- adc \$0x0, %r9
-
- mov 0x8(%rsi), %rax
- mul %r14
- add %rax, %r10
- mov %r10, 0x18(%rcx) # c3
- adc %rdx, %r8
- adc \$0x0, %r9
-
- xor %r10, %r10
- mov 0x18(%rsi), %rax
- mul %r12
- add %rax, %r8
- adc %rdx, %r9
- adc \$0x0, %r10
-
- mov 0x8(%rsi), %rax
- mul %r15
- add %rax, %r8
- adc %rdx, %r9
- adc \$0x0, %r10
-
- mov 0x10(%rsi), %rax
- mul %r14
- add %rax, %r8
- mov %r8, 0x20(%rcx) # c4
- adc %rdx, %r9
- adc \$0x0, %r10
-
- xor %r8, %r8
- mov 0x18(%rsi), %rax
- mul %r14
- add %rax, %r9
- adc %rdx, %r10
- adc \$0x0, %r8
-
- mov 0x10(%rsi), %rax
- mul %r15
- add %rax, %r9
- mov %r9, 0x28(%rcx) # c5
- adc %rdx, %r10
- adc \$0x0, %r8
-
- mov 0x18(%rsi), %rax
- mul %r15
- add %rax, %r10
- mov %r10, 0x30(%rcx) # c6
- adc %rdx, %r8
- mov %r8, 0x38(%rcx) # c7
-
- # rcx[8-15] <- AH*BH
- mov 0x20(%rdi), %r11
- mov 0x20(%rsi), %rax
- mul %r11
- xor %r9, %r9
- mov %rax, 0x40(%rcx) # c0
- mov %rdx, %r8
-
- mov 0x30(%rdi), %r14
- mov 0x28(%rsi), %rax
- mul %r11
- xor %r10, %r10
- add %rax, %r8
- adc %rdx, %r9
-
- mov 0x28(%rdi), %r12
- mov 0x20(%rsi), %rax
- mul %r12
- add %rax, %r8
- mov %r8, 0x48(%rcx) # c1
- adc %rdx, %r9
- adc \$0x0, %r10
-
- xor %r8, %r8
- mov 0x30(%rsi), %rax
- mul %r11
- add %rax, %r9
- adc %rdx, %r10
- adc \$0x0, %r8
-
- mov 0x20(%rsi), %r13
- mov %r14, %rax
- mul %r13
- add %rax, %r9
- adc %rdx, %r10
- adc \$0x0, %r8
-
- mov 0x28(%rsi), %rax
- mul %r12
- add %rax, %r9
- mov %r9, 0x50(%rcx) # c2
- adc %rdx, %r10
- adc \$0x0, %r8
-
- xor %r9, %r9
- mov 0x38(%rsi), %rax
- mul %r11
- mov 0x38(%rdi), %r15
- add %rax, %r10
- adc %rdx, %r8
- adc \$0x0, %r9
-
- mov %r15, %rax
- mul %r13
- add %rax, %r10
- adc %rdx, %r8
- adc \$0x0, %r9
-
- mov 0x30(%rsi), %rax
- mul %r12
- add %rax, %r10
- adc %rdx, %r8
- adc \$0x0, %r9
-
- mov 0x28(%rsi), %rax
- mul %r14
- add %rax, %r10
- mov %r10, 0x58(%rcx) # c3
- adc %rdx, %r8
- adc \$0x0, %r9
-
- xor %r10, %r10
- mov 0x38(%rsi), %rax
- mul %r12
- add %rax, %r8
- adc %rdx, %r9
- adc \$0x0, %r10
-
- mov 0x28(%rsi), %rax
- mul %r15
- add %rax, %r8
- adc %rdx, %r9
- adc \$0x0, %r10
-
- mov 0x30(%rsi), %rax
- mul %r14
- add %rax, %r8
- mov %r8, 0x60(%rcx) # c4
- adc %rdx, %r9
- adc \$0x0, %r10
-
- xor %r8, %r8
- mov 0x38(%rsi), %rax
- mul %r14
- add %rax, %r9
- adc %rdx, %r10
- adc \$0x0, %r8
-
- mov 0x30(%rsi), %rax
- mul %r15
- add %rax, %r9
- mov %r9, 0x68(%rcx) # c5
- adc %rdx, %r10
- adc \$0x0, %r8
-
- mov 0x38(%rsi), %rax
- mul %r15
- add %rax, %r10
- mov %r10, 0x70(%rcx) # c6
- adc %rdx, %r8
- mov %r8, 0x78(%rcx) # c7
-
- # [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL
- mov 0x0(%rsp), %r8
- sub 0x0(%rcx), %r8
- mov 0x8(%rsp), %r9
- sbb 0x8(%rcx), %r9
- mov 0x10(%rsp), %r10
- sbb 0x10(%rcx), %r10
- mov 0x18(%rsp), %r11
- sbb 0x18(%rcx), %r11
- mov 0x20(%rsp), %r12
- sbb 0x20(%rcx), %r12
- mov 0x28(%rsp), %r13
- sbb 0x28(%rcx), %r13
- mov 0x30(%rsp), %r14
- sbb 0x30(%rcx), %r14
- mov 0x38(%rsp), %r15
- sbb 0x38(%rcx), %r15
-
- # [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH
- mov 0x40(%rcx), %rax
- sub %rax, %r8
- mov 0x48(%rcx), %rax
- sbb %rax, %r9
- mov 0x50(%rcx), %rax
- sbb %rax, %r10
- mov 0x58(%rcx), %rax
- sbb %rax, %r11
- mov 0x60(%rcx), %rax
- sbb %rax, %r12
- mov 0x68(%rcx), %rdx
- sbb %rdx, %r13
- mov 0x70(%rcx), %rdi
- sbb %rdi, %r14
- mov 0x78(%rcx), %rsi
- sbb %rsi, %r15
-
- # Final result
- add 0x20(%rcx), %r8
- mov %r8, 0x20(%rcx)
- adc 0x28(%rcx), %r9
- mov %r9, 0x28(%rcx)
- adc 0x30(%rcx), %r10
- mov %r10, 0x30(%rcx)
- adc 0x38(%rcx), %r11
- mov %r11, 0x38(%rcx)
- adc 0x40(%rcx), %r12
- mov %r12, 0x40(%rcx)
- adc 0x48(%rcx), %r13
- mov %r13, 0x48(%rcx)
- adc 0x50(%rcx), %r14
- mov %r14, 0x50(%rcx)
- adc 0x58(%rcx), %r15
- mov %r15, 0x58(%rcx)
- adc \$0x0, %rax
- mov %rax, 0x60(%rcx)
- adc \$0x0, %rdx
- mov %rdx, 0x68(%rcx)
- adc \$0x0, %rdi
- mov %rdi, 0x70(%rcx)
- adc \$0x0, %rsi
- mov %rsi, 0x78(%rcx)
-
- add \$80, %rsp # Restoring space in stack
- .cfi_adjust_cfa_offset -80
pop %r15
.cfi_adjust_cfa_offset -8
pop %r14
@@ -1334,513 +1616,7 @@
return $body;
}
-$code.=&mul();
-
-# Optimized Montgomery reduction for CPUs with ADOX/ADCX and MULX
-# Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
-# Operation: c [rsi] = a [rdi]
-# NOTE: a=c is not allowed
-sub rdc_mulx {
- # a[0-1] x .Lp503p1_nz --> result: r8:r14
- my $mul01=&mul128x320_school(0,"rdi",".Lp503p1_nz(%rip)",map("r$_",(8..14)),"rbx","rcx","r15");
- # a[2-3] x .Lp503p1_nz --> result: r8:r14
- my $mul23=&mul128x320_school(16,"rdi",".Lp503p1_nz(%rip)",map("r$_",(8..14)),"rbx","rcx","r15");
- # a[4-5] x .Lp503p1_nz --> result: r8:r14
- my $mul45=&mul128x320_school(32,"rdi",".Lp503p1_nz(%rip)",map("r$_",(8..14)),"rbx","rcx","r15");
- # a[6-7] x .Lp503p1_nz --> result: r8:r14
- my $mul67=&mul128x320_school(48,"rdi",".Lp503p1_nz(%rip)",map("r$_", (8..14)),"rbx","rcx","r15");
-
- my $body=<<___;
- .Lrdc_mulx_asm:
- .cfi_startproc
- # sike_fprdc has already pushed r12--15 and rbx by this point.
- .cfi_adjust_cfa_offset 32
- .cfi_offset r12, -16
- .cfi_offset r13, -24
- .cfi_offset r14, -32
- .cfi_offset r15, -40
- .cfi_offset rbx, -48
- .cfi_adjust_cfa_offset 8
-
- $mul01
-
- xor %r15, %r15
- add 0x18(%rdi), %r8
- adc 0x20(%rdi), %r9
- adc 0x28(%rdi), %r10
- adc 0x30(%rdi), %r11
- adc 0x38(%rdi), %r12
- adc 0x40(%rdi), %r13
- adc 0x48(%rdi), %r14
- adc 0x50(%rdi), %r15
- mov %r8, 0x18(%rdi)
- mov %r9, 0x20(%rdi)
- mov %r10, 0x28(%rdi)
- mov %r11, 0x30(%rdi)
- mov %r12, 0x38(%rdi)
- mov %r13, 0x40(%rdi)
- mov %r14, 0x48(%rdi)
- mov %r15, 0x50(%rdi)
- mov 0x58(%rdi), %r8
- mov 0x60(%rdi), %r9
- mov 0x68(%rdi), %r10
- mov 0x70(%rdi), %r11
- mov 0x78(%rdi), %r12
- adc \$0x0, %r8
- adc \$0x0, %r9
- adc \$0x0, %r10
- adc \$0x0, %r11
- adc \$0x0, %r12
- mov %r8, 0x58(%rdi)
- mov %r9, 0x60(%rdi)
- mov %r10, 0x68(%rdi)
- mov %r11, 0x70(%rdi)
- mov %r12, 0x78(%rdi)
-
- $mul23
-
- xor %r15, %r15
- add 0x28(%rdi), %r8
- adc 0x30(%rdi), %r9
- adc 0x38(%rdi), %r10
- adc 0x40(%rdi), %r11
- adc 0x48(%rdi), %r12
- adc 0x50(%rdi), %r13
- adc 0x58(%rdi), %r14
- adc 0x60(%rdi), %r15
- mov %r8, 0x28(%rdi)
- mov %r9, 0x30(%rdi)
- mov %r10, 0x38(%rdi)
- mov %r11, 0x40(%rdi)
- mov %r12, 0x48(%rdi)
- mov %r13, 0x50(%rdi)
- mov %r14, 0x58(%rdi)
- mov %r15, 0x60(%rdi)
- mov 0x68(%rdi), %r8
- mov 0x70(%rdi), %r9
- mov 0x78(%rdi), %r10
- adc \$0x0, %r8
- adc \$0x0, %r9
- adc \$0x0, %r10
- mov %r8, 0x68(%rdi)
- mov %r9, 0x70(%rdi)
- mov %r10, 0x78(%rdi)
-
- $mul45
-
- xor %r15, %r15
- xor %rbx, %rbx
- add 0x38(%rdi), %r8
- adc 0x40(%rdi), %r9
- adc 0x48(%rdi), %r10
- adc 0x50(%rdi), %r11
- adc 0x58(%rdi), %r12
- adc 0x60(%rdi), %r13
- adc 0x68(%rdi), %r14
- adc 0x70(%rdi), %r15
- adc 0x78(%rdi), %rbx
- mov %r8, 0x38(%rdi)
- mov %r9, (%rsi) # Final result c0
- mov %r10, 0x48(%rdi)
- mov %r11, 0x50(%rdi)
- mov %r12, 0x58(%rdi)
- mov %r13, 0x60(%rdi)
- mov %r14, 0x68(%rdi)
- mov %r15, 0x70(%rdi)
- mov %rbx, 0x78(%rdi)
-
- $mul67
-
- add 0x48(%rdi), %r8
- adc 0x50(%rdi), %r9
- adc 0x58(%rdi), %r10
- adc 0x60(%rdi), %r11
- adc 0x68(%rdi), %r12
- adc 0x70(%rdi), %r13
- adc 0x78(%rdi), %r14
- mov %r8, 0x8(%rsi)
- mov %r9, 0x10(%rsi)
- mov %r10, 0x18(%rsi)
- mov %r11, 0x20(%rsi)
- mov %r12, 0x28(%rsi)
- mov %r13, 0x30(%rsi)
- mov %r14, 0x38(%rsi)
-
- pop %rbx
- .cfi_adjust_cfa_offset -8
- .cfi_same_value rbx
- pop %r15
- .cfi_adjust_cfa_offset -8
- .cfi_same_value r15
- pop %r14
- .cfi_adjust_cfa_offset -8
- .cfi_same_value r14
- pop %r13
- .cfi_adjust_cfa_offset -8
- .cfi_same_value r13
- pop %r12
- .cfi_adjust_cfa_offset -8
- .cfi_same_value r12
- ret
- .cfi_endproc
-___
- return $body;
-}
-
-# Montgomery reduction
-# Based on comba method
-# Operation: c [rsi] = a [rdi]
-# NOTE: a=c is not allowed
-sub rdc {
- my $jump_optim=&alt_impl(".Lrdc_mulx_asm") if ($bmi2_adx);
- my $body=&rdc_mulx() if ($bmi2_adx);
-
- $body.=<<___;
- .globl ${PREFIX}_fprdc
- .type ${PREFIX}_fprdc,\@function,3
- ${PREFIX}_fprdc:
- .cfi_startproc
- push %r12
- .cfi_adjust_cfa_offset 8
- .cfi_offset r12, -16
- push %r13
- .cfi_adjust_cfa_offset 8
- .cfi_offset r13, -24
- push %r14
- .cfi_adjust_cfa_offset 8
- .cfi_offset r14, -32
- push %r15
- .cfi_adjust_cfa_offset 8
- .cfi_offset r15, -40
- push %rbx
- .cfi_adjust_cfa_offset 8
- .cfi_offset rbx, -48
-
- $jump_optim
-
- # Reduction, generic x86 implementation
- lea .Lp503p1(%rip), %rbx
-
- mov (%rdi), %r11
- mov (%rbx), %rax
- mul %r11
- xor %r8, %r8
- add 0x18(%rdi), %rax
- mov %rax, 0x18(%rsi) # z3
- adc %rdx, %r8
-
- xor %r9, %r9
- mov 0x8(%rbx), %rax
- mul %r11
- xor %r10, %r10
- add %rax, %r8
- adc %rdx, %r9
-
- mov 0x8(%rdi), %r12
- mov (%rbx), %rax
- mul %r12
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
- add 0x20(%rdi), %r8
- mov %r8, 0x20(%rsi) # z4
- adc \$0, %r9
- adc \$0, %r10
-
- xor %r8, %r8
- mov 0x10(%rbx), %rax
- mul %r11
- add %rax, %r9
- adc %rdx, %r10
- adc \$0, %r8
-
- mov 8(%rbx), %rax
- mul %r12
- add %rax, %r9
- adc %rdx, %r10
- adc \$0, %r8
-
- mov 0x10(%rdi), %r13
- mov (%rbx), %rax
- mul %r13
- add %rax, %r9
- adc %rdx, %r10
- adc \$0, %r8
- add 0x28(%rdi), %r9
- mov %r9, 0x28(%rsi) # z5
- adc \$0, %r10
- adc \$0, %r8
-
- xor %r9, %r9
- mov 0x18(%rbx), %rax
- mul %r11
- add %rax, %r10
- adc %rdx, %r8
- adc \$0, %r9
-
- mov 0x10(%rbx), %rax
- mul %r12
- add %rax, %r10
- adc %rdx, %r8
- adc \$0, %r9
-
- mov 0x8(%rbx), %rax
- mul %r13
- add %rax, %r10
- adc %rdx, %r8
- adc \$0, %r9
-
- mov 0x18(%rsi), %r14
- mov (%rbx), %rax
- mul %r14
- add %rax, %r10
- adc %rdx, %r8
- adc \$0, %r9
- add 0x30(%rdi), %r10
- mov %r10, 0x30(%rsi) # z6
- adc \$0, %r8
- adc \$0, %r9
-
- xor %r10, %r10
- mov 0x20(%rbx), %rax
- mul %r11
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
-
- mov 0x18(%rbx), %rax
- mul %r12
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
-
- mov 0x10(%rbx), %rax
- mul %r13
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
-
- mov 0x8(%rbx), %rax
- mul %r14
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
-
- mov 0x20(%rsi), %r15
- mov (%rbx), %rax
- mul %r15
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
- add 0x38(%rdi), %r8 # Z7
- mov %r8, 0x38(%rsi)
- adc \$0, %r9
- adc \$0, %r10
-
- xor %r8, %r8
- mov 0x20(%rbx), %rax
- mul %r12
- add %rax, %r9
- adc %rdx, %r10
- adc \$0, %r8
-
- mov 0x18(%rbx), %rax
- mul %r13
- add %rax, %r9
- adc %rdx, %r10
- adc \$0, %r8
-
- mov 0x10(%rbx), %rax
- mul %r14
- add %rax, %r9
- adc %rdx, %r10
- adc \$0, %r8
-
- mov 0x8(%rbx), %rax
- mul %r15
- add %rax, %r9
- adc %rdx, %r10
- adc \$0, %r8
-
- mov 0x28(%rsi), %rcx
- mov (%rbx), %rax
- mul %rcx
- add %rax, %r9
- adc %rdx, %r10
- adc \$0, %r8
- add 0x40(%rdi), %r9
- mov %r9, (%rsi) # Z9
- adc \$0, %r10
- adc \$0, %r8
-
- xor %r9, %r9
- mov 0x20(%rbx), %rax
- mul %r13
- add %rax, %r10
- adc %rdx, %r8
- adc \$0, %r9
-
- mov 0x18(%rbx), %rax
- mul %r14
- add %rax, %r10
- adc %rdx, %r8
- adc \$0, %r9
-
- mov 0x10(%rbx), %rax
- mul %r15
- add %rax, %r10
- adc %rdx, %r8
- adc \$0, %r9
-
- mov 8(%rbx), %rax
- mul %rcx
- add %rax, %r10
- adc %rdx, %r8
- adc \$0, %r9
-
- mov 0x30(%rsi), %r13
- mov (%rbx), %rax
- mul %r13
- add %rax, %r10
- adc %rdx, %r8
- adc \$0, %r9
- add 0x48(%rdi), %r10
- mov %r10, 0x8(%rsi) # Z1
- adc \$0, %r8
- adc \$0, %r9
-
- xor %r10, %r10
- mov 0x20(%rbx), %rax
- mul %r14
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
-
- mov 0x18(%rbx), %rax
- mul %r15
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
-
- mov 0x10(%rbx), %rax
- mul %rcx
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
-
- mov 8(%rbx), %rax
- mul %r13
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
-
- mov 0x38(%rsi), %r14
- mov (%rbx), %rax
- mul %r14
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
- add 0x50(%rdi), %r8
- mov %r8, 0x10(%rsi) # Z2
- adc \$0, %r9
- adc \$0, %r10
-
- xor %r8, %r8
- mov 0x20(%rbx), %rax
- mul %r15
- add %rax, %r9
- adc %rdx, %r10
- adc \$0, %r8
-
- mov 0x18(%rbx), %rax
- mul %rcx
- add %rax, %r9
- adc %rdx, %r10
- adc \$0, %r8
-
- mov 0x10(%rbx), %rax
- mul %r13
- add %rax, %r9
- adc %rdx, %r10
- adc \$0, %r8
-
- mov 8(%rbx), %rax
- mul %r14
- add %rax, %r9
- adc %rdx, %r10
- adc \$0, %r8
- add 0x58(%rdi), %r9
- mov %r9, 0x18(%rsi) # Z3
- adc \$0, %r10
- adc \$0, %r8
-
- xor %r9, %r9
- mov 0x20(%rbx), %rax
- mul %rcx
- add %rax, %r10
- adc %rdx, %r8
- adc \$0, %r9
-
- mov 0x18(%rbx), %rax
- mul %r13
- add %rax, %r10
- adc %rdx, %r8
- adc \$0, %r9
-
- mov 0x10(%rbx), %rax
- mul %r14
- add %rax, %r10
- adc %rdx, %r8
- adc \$0, %r9
- add 0x60(%rdi), %r10
- mov %r10, 0x20(%rsi) # Z4
- adc \$0, %r8
- adc \$0, %r9
-
- xor %r10, %r10
- mov 0x20(%rbx), %rax
- mul %r13
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
-
- mov 0x18(%rbx), %rax
- mul %r14
- add %rax, %r8
- adc %rdx, %r9
- adc \$0, %r10
- add 0x68(%rdi), %r8 # Z5
- mov %r8, 0x28(%rsi) # Z5
- adc \$0, %r9
- adc \$0, %r10
-
- mov 0x20(%rbx), %rax
- mul %r14
- add %rax, %r9
- adc %rdx, %r10
- add 0x70(%rdi), %r9 # Z6
- mov %r9, 0x30(%rsi) # Z6
- adc \$0, %r10
- add 0x78(%rdi), %r10 # Z7
- mov %r10, 0x38(%rsi) # Z7
-
- pop %rbx
- .cfi_adjust_cfa_offset -8
- pop %r15
- .cfi_adjust_cfa_offset -8
- pop %r14
- .cfi_adjust_cfa_offset -8
- pop %r13
- .cfi_adjust_cfa_offset -8
- pop %r12
- .cfi_adjust_cfa_offset -8
- ret
- .cfi_endproc
-___
- return $body;
-}
-
-$code.=&rdc();
+$code.=&sike_mul();
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
diff --git a/third_party/sike/asm/fp_generic.c b/third_party/sike/asm/fp_generic.c
index 60e0da1..cdf8755 100644
--- a/third_party/sike/asm/fp_generic.c
+++ b/third_party/sike/asm/fp_generic.c
@@ -13,7 +13,7 @@
#include "../fpx.h"
// Global constants
-extern const struct params_t p503;
+extern const struct params_t params;
static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c)
{ // Digit multiplication, digit * digit -> 2-digit result
@@ -50,9 +50,9 @@
}
void sike_fpadd(const felm_t a, const felm_t b, felm_t c)
-{ // Modular addition, c = a+b mod p503.
- // Inputs: a, b in [0, 2*p503-1]
- // Output: c in [0, 2*p503-1]
+{ // Modular addition, c = a+b mod p434.
+ // Inputs: a, b in [0, 2*p434-1]
+ // Output: c in [0, 2*p434-1]
unsigned int i, carry = 0;
crypto_word_t mask;
@@ -62,20 +62,20 @@
carry = 0;
for (i = 0; i < NWORDS_FIELD; i++) {
- SUBC(carry, c[i], p503.prime_x2[i], carry, c[i]);
+ SUBC(carry, c[i], params.prime_x2[i], carry, c[i]);
}
mask = 0 - (crypto_word_t)carry;
carry = 0;
for (i = 0; i < NWORDS_FIELD; i++) {
- ADDC(carry, c[i], p503.prime_x2[i] & mask, carry, c[i]);
+ ADDC(carry, c[i], params.prime_x2[i] & mask, carry, c[i]);
}
}
void sike_fpsub(const felm_t a, const felm_t b, felm_t c)
-{ // Modular subtraction, c = a-b mod p503.
- // Inputs: a, b in [0, 2*p503-1]
- // Output: c in [0, 2*p503-1]
+{ // Modular subtraction, c = a-b mod p434.
+ // Inputs: a, b in [0, 2*p434-1]
+ // Output: c in [0, 2*p434-1]
unsigned int i, borrow = 0;
crypto_word_t mask;
@@ -86,7 +86,7 @@
borrow = 0;
for (i = 0; i < NWORDS_FIELD; i++) {
- ADDC(borrow, c[i], p503.prime_x2[i] & mask, borrow, c[i]);
+ ADDC(borrow, c[i], params.prime_x2[i] & mask, borrow, c[i]);
}
}
@@ -124,12 +124,12 @@
c[2*NWORDS_FIELD-1] = v;
}
-void sike_fprdc(const felm_t ma, felm_t mc)
-{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p503.
- // mc = ma*R^-1 mod p503x2, where R = 2^512.
- // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1].
+void sike_fprdc(felm_t ma, felm_t mc)
+{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434.
+ // mc = ma*R^-1 mod p434x2, where R = 2^448.
+ // If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1].
// ma is assumed to be in Montgomery representation.
- unsigned int i, j, carry, count = p503_ZERO_WORDS;
+ unsigned int i, j, carry, count = ZERO_WORDS;
crypto_word_t UV[2], t = 0, u = 0, v = 0;
for (i = 0; i < NWORDS_FIELD; i++) {
@@ -138,8 +138,8 @@
for (i = 0; i < NWORDS_FIELD; i++) {
for (j = 0; j < i; j++) {
- if (j < (i-p503_ZERO_WORDS+1)) {
- MUL(mc[j], p503.prime_p1[i-j], UV+1, UV[0]);
+ if (j < (i-ZERO_WORDS+1)) {
+ MUL(mc[j], params.prime_p1[i-j], UV+1, UV[0]);
ADDC(0, UV[0], v, carry, v);
ADDC(carry, UV[1], u, carry, u);
t += carry;
@@ -160,7 +160,7 @@
}
for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) {
if (j < (NWORDS_FIELD-count)) {
- MUL(mc[j], p503.prime_p1[i-j], UV+1, UV[0]);
+ MUL(mc[j], params.prime_p1[i-j], UV+1, UV[0]);
ADDC(0, UV[0], v, carry, v);
ADDC(carry, UV[1], u, carry, u);
t += carry;
diff --git a/third_party/sike/fpx.c b/third_party/sike/fpx.c
index 0951418..d85875d 100644
--- a/third_party/sike/fpx.c
+++ b/third_party/sike/fpx.c
@@ -8,7 +8,7 @@
#include "utils.h"
#include "fpx.h"
-extern const struct params_t p503;
+extern const struct params_t params;
// Multiprecision squaring, c = a^2 mod p.
static void fpsqr_mont(const felm_t ma, felm_t mc)
@@ -22,101 +22,79 @@
static void fpinv_chain_mont(felm_t a)
{
unsigned int i, j;
- felm_t t[15], tt;
+ felm_t t[31], tt;
// Precomputed table
fpsqr_mont(a, tt);
sike_fpmul_mont(a, tt, t[0]);
- for (i = 0; i <= 13; i++) sike_fpmul_mont(t[i], tt, t[i+1]);
+ for (i = 0; i <= 29; i++) sike_fpmul_mont(t[i], tt, t[i+1]);
sike_fpcopy(a, tt);
- for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(a, tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[8], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[6], tt, tt);
- for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[9], tt, tt);
for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[0], tt, tt);
- for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(a, tt, tt);
- for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[6], tt, tt);
- for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[2], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[8], tt, tt);
- for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(a, tt, tt);
- for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[10], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[0], tt, tt);
- for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[10], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[10], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
sike_fpmul_mont(t[5], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[2], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[6], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
+ for (i = 0; i < 10; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[14], tt, tt);
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
sike_fpmul_mont(t[3], tt, tt);
for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[5], tt, tt);
- for (i = 0; i < 12; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[12], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[8], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[6], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[12], tt, tt);
+ sike_fpmul_mont(t[23], tt, tt);
for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[11], tt, tt);
- for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[6], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[5], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[14], tt, tt);
- for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[14], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[5], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[6], tt, tt);
- for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[8], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(a, tt, tt);
- for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[4], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[6], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[5], tt, tt);
- for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[7], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(a, tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[0], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[11], tt, tt);
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
sike_fpmul_mont(t[13], tt, tt);
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[24], tt, tt);
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[7], tt, tt);
for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[12], tt, tt);
+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[30], tt, tt);
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
sike_fpmul_mont(t[1], tt, tt);
for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[30], tt, tt);
+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[21], tt, tt);
+ for (i = 0; i < 9; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[2], tt, tt);
+ for (i = 0; i < 9; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[19], tt, tt);
+ for (i = 0; i < 9; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[1], tt, tt);
+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[24], tt, tt);
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[26], tt, tt);
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[16], tt, tt);
+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
sike_fpmul_mont(t[10], tt, tt);
- for (j = 0; j < 49; j++) {
- for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
- sike_fpmul_mont(t[14], tt, tt);
+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[6], tt, tt);
+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[0], tt, tt);
+ for (i = 0; i < 9; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[20], tt, tt);
+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[9], tt, tt);
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[25], tt, tt);
+ for (i = 0; i < 9; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[30], tt, tt);
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[26], tt, tt);
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(a, tt, tt);
+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[28], tt, tt);
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[6], tt, tt);
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[10], tt, tt);
+ for (i = 0; i < 9; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[22], tt, tt);
+ for (j = 0; j < 35; j++) {
+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+ sike_fpmul_mont(t[30], tt, tt);
}
sike_fpcopy(tt, a);
}
@@ -190,7 +168,7 @@
}
}
-// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768
+// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod prime, where R=2^448
void sike_fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc)
{
dfelm_t temp = {0};
@@ -227,7 +205,7 @@
void sike_fpneg(felm_t a) {
uint32_t borrow = 0;
for (size_t i = 0; i < NWORDS_FIELD; i++) {
- SUBC(borrow, p503.prime_x2[i], a[i], borrow, a[i]);
+ SUBC(borrow, params.prime_x2[i], a[i], borrow, a[i]);
}
}
@@ -240,7 +218,7 @@
mask = 0 - (crypto_word_t)(a[0] & 1); // If a is odd compute a+p503
for (size_t i = 0; i < NWORDS_FIELD; i++) {
- ADDC(carry, a[i], p503.prime[i] & mask, carry, c[i]);
+ ADDC(carry, a[i], params.prime[i] & mask, carry, c[i]);
}
// Multiprecision right shift by one.
@@ -256,13 +234,13 @@
crypto_word_t mask;
for (size_t i = 0; i < NWORDS_FIELD; i++) {
- SUBC(borrow, a[i], p503.prime[i], borrow, a[i]);
+ SUBC(borrow, a[i], params.prime[i], borrow, a[i]);
}
mask = 0 - (crypto_word_t)borrow;
borrow = 0;
for (size_t i = 0; i < NWORDS_FIELD; i++) {
- ADDC(borrow, a[i], p503.prime[i] & mask, borrow, a[i]);
+ ADDC(borrow, a[i], params.prime[i] & mask, borrow, a[i]);
}
}
@@ -283,7 +261,7 @@
mask = mp_subfast(tt1, tt2, tt1); // tt1 = a0*b0 - a1*b1. If tt1 < 0 then mask = 0xFF..F, else if tt1 >= 0 then mask = 0x00..0
for (size_t i = 0; i < NWORDS_FIELD; i++) {
- t1[i] = p503.prime[i] & mask;
+ t1[i] = params.prime[i] & mask;
}
sike_fprdc(tt3, c->c1); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
diff --git a/third_party/sike/fpx.h b/third_party/sike/fpx.h
index e787c28..c4c45bd 100644
--- a/third_party/sike/fpx.h
+++ b/third_party/sike/fpx.h
@@ -7,25 +7,26 @@
extern "C" {
#endif
-// Modular addition, c = a+b mod p503.
+// Modular addition, c = a+b mod p.
void sike_fpadd(const felm_t a, const felm_t b, felm_t c);
-// Modular subtraction, c = a-b mod p503.
+// Modular subtraction, c = a-b mod p.
void sike_fpsub(const felm_t a, const felm_t b, felm_t c);
-// Modular division by two, c = a/2 mod p503.
+// Modular division by two, c = a/2 mod p.
void sike_fpdiv2(const felm_t a, felm_t c);
-// Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1].
+// Modular correction to reduce field element a in [0, 2*p-1] to [0, p-1].
void sike_fpcorrection(felm_t a);
// Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c);
-// 503-bit Montgomery reduction, c = a mod p
-void sike_fprdc(const dfelm_t a, felm_t c);
-// Double 2x503-bit multiprecision subtraction, c = c-a-b
+// 434-bit Montgomery reduction, c = a mod p. The contents of buffer 'a' are
+// modified by the call.
+void sike_fprdc(dfelm_t a, felm_t c);
+// Double 2x434-bit multiprecision subtraction, c = c-a-b
void sike_mpdblsubx2_asm(const felm_t a, const felm_t b, felm_t c);
// Multiprecision subtraction, c = a-b
crypto_word_t sike_mpsubx2_asm(const dfelm_t a, const dfelm_t b, dfelm_t c);
-// 503-bit multiprecision addition, c = a+b
+// 434-bit multiprecision addition, c = a+b
void sike_mpadd_asm(const felm_t a, const felm_t b, felm_t c);
-// Modular negation, a = -a mod p503.
+// Modular negation, a = -a mod p.
void sike_fpneg(felm_t a);
// Copy of a field element, c = a
void sike_fpcopy(const felm_t a, felm_t c);
@@ -36,11 +37,11 @@
// Conversion from Montgomery representation to standard representation,
// c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1].
void sike_from_mont(const felm_t ma, felm_t c);
-// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768
+// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^448
void sike_fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc);
-// GF(p503^2) multiplication using Montgomery arithmetic, c = a*b in GF(p503^2)
+// GF(p443^2) multiplication using Montgomery arithmetic, c = a*b in GF(p443^2)
void sike_fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c);
-// GF(p503^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
+// GF(p443^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
void sike_fp2inv_mont(f2elm_t a);
// GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2).
void sike_fp2sqr_mont(const f2elm_t a, f2elm_t c);
@@ -97,8 +98,8 @@
// mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2).
#define sike_to_fp2mont(a, mc) \
do { \
- sike_fpmul_mont(a->c0, p503.mont_R2, mc->c0); \
- sike_fpmul_mont(a->c1, p503.mont_R2, mc->c1); \
+ sike_fpmul_mont(a->c0, params.mont_R2, mc->c0); \
+ sike_fpmul_mont(a->c1, params.mont_R2, mc->c1); \
} while(0)
// Conversion of a GF(p^2) element from Montgomery representation to standard representation,
diff --git a/third_party/sike/isogeny.c b/third_party/sike/isogeny.c
index b8807f3..edb1363 100644
--- a/third_party/sike/isogeny.c
+++ b/third_party/sike/isogeny.c
@@ -189,8 +189,8 @@
// Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x.
f2elm_t t0, t1, one = F2ELM_INIT;
- extern const struct params_t p503;
- sike_fpcopy(p503.mont_one, one->c0);
+ extern const struct params_t params;
+ sike_fpcopy(params.mont_one, one->c0);
sike_fp2add(xP, xQ, t1); // t1 = xP+xQ
sike_fp2mul_mont(xP, xQ, t0); // t0 = xP*xQ
sike_fp2mul_mont(xR, t1, A); // A = xR*t1
diff --git a/third_party/sike/params.c b/third_party/sike/params.c
new file mode 100644
index 0000000..b13f4c8
--- /dev/null
+++ b/third_party/sike/params.c
@@ -0,0 +1,128 @@
+/********************************************************************************************
+* SIDH: an efficient supersingular isogeny cryptography library
+*
+* Abstract: supersingular isogeny parameters and generation of functions for P434
+*********************************************************************************************/
+
+#include "utils.h"
+
+// Parameters for isogeny system "SIKE"
+const struct params_t params = {
+ .prime = {
+ U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF),
+ U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFDC1767AE2FFFFFF),
+ U64_TO_WORDS(0x7BC65C783158AEA3), U64_TO_WORDS(0x6CFC5FD681C52056),
+ U64_TO_WORDS(0x0002341F27177344)
+ },
+ .prime_p1 = {
+ U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+ U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xFDC1767AE3000000),
+ U64_TO_WORDS(0x7BC65C783158AEA3), U64_TO_WORDS(0x6CFC5FD681C52056),
+ U64_TO_WORDS(0x0002341F27177344)
+ },
+ .prime_x2 = {
+ U64_TO_WORDS(0xFFFFFFFFFFFFFFFE), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF),
+ U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFB82ECF5C5FFFFFF),
+ U64_TO_WORDS(0xF78CB8F062B15D47), U64_TO_WORDS(0xD9F8BFAD038A40AC),
+ U64_TO_WORDS(0x0004683E4E2EE688)
+ },
+ .A_gen = {
+ U64_TO_WORDS(0x05ADF455C5C345BF), U64_TO_WORDS(0x91935C5CC767AC2B),
+ U64_TO_WORDS(0xAFE4E879951F0257), U64_TO_WORDS(0x70E792DC89FA27B1),
+ U64_TO_WORDS(0xF797F526BB48C8CD), U64_TO_WORDS(0x2181DB6131AF621F),
+ U64_TO_WORDS(0x00000A1C08B1ECC4), // XPA0
+ U64_TO_WORDS(0x74840EB87CDA7788), U64_TO_WORDS(0x2971AA0ECF9F9D0B),
+ U64_TO_WORDS(0xCB5732BDF41715D5), U64_TO_WORDS(0x8CD8E51F7AACFFAA),
+ U64_TO_WORDS(0xA7F424730D7E419F), U64_TO_WORDS(0xD671EB919A179E8C),
+ U64_TO_WORDS(0x0000FFA26C5A924A), // XPA1
+ U64_TO_WORDS(0xFEC6E64588B7273B), U64_TO_WORDS(0xD2A626D74CBBF1C6),
+ U64_TO_WORDS(0xF8F58F07A78098C7), U64_TO_WORDS(0xE23941F470841B03),
+ U64_TO_WORDS(0x1B63EDA2045538DD), U64_TO_WORDS(0x735CFEB0FFD49215),
+ U64_TO_WORDS(0x0001C4CB77542876), // XQA0
+ U64_TO_WORDS(0xADB0F733C17FFDD6), U64_TO_WORDS(0x6AFFBD037DA0A050),
+ U64_TO_WORDS(0x680EC43DB144E02F), U64_TO_WORDS(0x1E2E5D5FF524E374),
+ U64_TO_WORDS(0xE2DDA115260E2995), U64_TO_WORDS(0xA6E4B552E2EDE508),
+ U64_TO_WORDS(0x00018ECCDDF4B53E), // XQA1
+ U64_TO_WORDS(0x01BA4DB518CD6C7D), U64_TO_WORDS(0x2CB0251FE3CC0611),
+ U64_TO_WORDS(0x259B0C6949A9121B), U64_TO_WORDS(0x60E17AC16D2F82AD),
+ U64_TO_WORDS(0x3AA41F1CE175D92D), U64_TO_WORDS(0x413FBE6A9B9BC4F3),
+ U64_TO_WORDS(0x00022A81D8D55643), // XRA0
+ U64_TO_WORDS(0xB8ADBC70FC82E54A), U64_TO_WORDS(0xEF9CDDB0D5FADDED),
+ U64_TO_WORDS(0x5820C734C80096A0), U64_TO_WORDS(0x7799994BAA96E0E4),
+ U64_TO_WORDS(0x044961599E379AF8), U64_TO_WORDS(0xDB2B94FBF09F27E2),
+ U64_TO_WORDS(0x0000B87FC716C0C6) // XRA1
+ },
+ .B_gen = {
+ U64_TO_WORDS(0x6E5497556EDD48A3), U64_TO_WORDS(0x2A61B501546F1C05),
+ U64_TO_WORDS(0xEB919446D049887D), U64_TO_WORDS(0x5864A4A69D450C4F),
+ U64_TO_WORDS(0xB883F276A6490D2B), U64_TO_WORDS(0x22CC287022D5F5B9),
+ U64_TO_WORDS(0x0001BED4772E551F), // XPB0
+ U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+ U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+ U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+ U64_TO_WORDS(0x0000000000000000), // XPB1
+ U64_TO_WORDS(0xFAE2A3F93D8B6B8E), U64_TO_WORDS(0x494871F51700FE1C),
+ U64_TO_WORDS(0xEF1A94228413C27C), U64_TO_WORDS(0x498FF4A4AF60BD62),
+ U64_TO_WORDS(0xB00AD2A708267E8A), U64_TO_WORDS(0xF4328294E017837F),
+ U64_TO_WORDS(0x000034080181D8AE), // XQB0
+ U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+ U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+ U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+ U64_TO_WORDS(0x0000000000000000), // XQB1
+ U64_TO_WORDS(0x283B34FAFEFDC8E4), U64_TO_WORDS(0x9208F44977C3E647),
+ U64_TO_WORDS(0x7DEAE962816F4E9A), U64_TO_WORDS(0x68A2BA8AA262EC9D),
+ U64_TO_WORDS(0x8176F112EA43F45B), U64_TO_WORDS(0x02106D022634F504),
+ U64_TO_WORDS(0x00007E8A50F02E37), // XRB0
+ U64_TO_WORDS(0xB378B7C1DA22CCB1), U64_TO_WORDS(0x6D089C99AD1D9230),
+ U64_TO_WORDS(0xEBE15711813E2369), U64_TO_WORDS(0x2B35A68239D48A53),
+ U64_TO_WORDS(0x445F6FD138407C93), U64_TO_WORDS(0xBEF93B29A3F6B54B),
+ U64_TO_WORDS(0x000173FA910377D3) // XRB1
+ },
+ .mont_R2 = {
+ U64_TO_WORDS(0x28E55B65DCD69B30), U64_TO_WORDS(0xACEC7367768798C2),
+ U64_TO_WORDS(0xAB27973F8311688D), U64_TO_WORDS(0x175CC6AF8D6C7C0B),
+ U64_TO_WORDS(0xABCD92BF2DDE347E), U64_TO_WORDS(0x69E16A61C7686D9A),
+ U64_TO_WORDS(0x000025A89BCDD12A)
+ },
+ .mont_one = {
+ U64_TO_WORDS(0x000000000000742C), U64_TO_WORDS(0x0000000000000000),
+ U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xB90FF404FC000000),
+ U64_TO_WORDS(0xD801A4FB559FACD4), U64_TO_WORDS(0xE93254545F77410C),
+ U64_TO_WORDS(0x0000ECEEA7BD2EDA)
+ },
+ .mont_six = {
+ U64_TO_WORDS(0x000000000002B90A), U64_TO_WORDS(0x0000000000000000),
+ U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x5ADCCB2822000000),
+ U64_TO_WORDS(0x187D24F39F0CAFB4), U64_TO_WORDS(0x9D353A4D394145A0),
+ U64_TO_WORDS(0x00012559A0403298)
+ },
+ .A_strat = {
+ 0x30, 0x1C, 0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+ 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04,
+ 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01,
+ 0x02, 0x01, 0x01, 0x0D, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02,
+ 0x01, 0x01, 0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x04,
+ 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01,
+ 0x15, 0x0C, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
+ 0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x03, 0x02, 0x01,
+ 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x09, 0x05, 0x03,
+ 0x02, 0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x04,
+ 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01
+ },
+ .B_strat = {
+ 0x42, 0x21, 0x11, 0x09, 0x05, 0x03, 0x02, 0x01, 0x01, 0x01,
+ 0x01, 0x02, 0x01, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x01,
+ 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02,
+ 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x10,
+ 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04,
+ 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01,
+ 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+ 0x01, 0x20, 0x10, 0x08, 0x04, 0x03, 0x01, 0x01, 0x01, 0x01,
+ 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
+ 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02,
+ 0x01, 0x01, 0x02, 0x01, 0x01, 0x10, 0x08, 0x04, 0x02, 0x01,
+ 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+ 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04,
+ 0x02, 0x01, 0x01, 0x02, 0x01, 0x01
+ }
+};
diff --git a/third_party/sike/sike.c b/third_party/sike/sike.c
index 689baa8..f6a19be 100644
--- a/third_party/sike/sike.c
+++ b/third_party/sike/sike.c
@@ -11,65 +11,25 @@
#include <openssl/base.h>
#include <openssl/rand.h>
#include <openssl/mem.h>
-#include <openssl/hmac.h>
#include <openssl/sha.h>
#include "utils.h"
#include "isogeny.h"
#include "fpx.h"
-extern const struct params_t p503;
+extern const struct params_t params;
-// Domain separation parameters for HMAC
-static const uint8_t G[2] = {0,0};
-static const uint8_t H[2] = {1,0};
-static const uint8_t F[2] = {2,0};
-
-// SIDHp503_JINV_BYTESZ is a number of bytes used for encoding j-invariant.
-#define SIDHp503_JINV_BYTESZ 126U
-// SIDHp503_PRV_A_BITSZ is a number of bits of SIDH private key (2-isogeny)
-#define SIDHp503_PRV_A_BITSZ 250U
-// SIDHp503_PRV_A_BITSZ is a number of bits of SIDH private key (3-isogeny)
-#define SIDHp503_PRV_B_BITSZ 253U
+// SIDH_JINV_BYTESZ is a number of bytes used for encoding j-invariant.
+#define SIDH_JINV_BYTESZ 110U
+// SIDH_PRV_A_BITSZ is a number of bits of SIDH private key (2-isogeny)
+#define SIDH_PRV_A_BITSZ 216U
+// SIDH_PRV_B_BITSZ is a number of bits of SIDH private key (3-isogeny)
+#define SIDH_PRV_B_BITSZ 217U
// MAX_INT_POINTS_ALICE is a number of points used in 2-isogeny tree computation
#define MAX_INT_POINTS_ALICE 7U
// MAX_INT_POINTS_ALICE is a number of points used in 3-isogeny tree computation
#define MAX_INT_POINTS_BOB 8U
-// Produces HMAC-SHA256 of data |S| mac'ed with the key |key|. Result is stored in |out|
-// which must have size of at least |outsz| bytes and must be not bigger than
-// SHA256_DIGEST_LENGTH. The output of a HMAC may be truncated.
-// The |key| buffer is reused by the hmac_sum and hence, it's size must be equal
-// to SHA256_CBLOCK. The HMAC key provided in |key| buffer must be smaller or equal
-// to SHA256_DIGHEST_LENTH. |key| can overlap |out|.
-static void hmac_sum(
- uint8_t *out, size_t outsz, const uint8_t S[2], uint8_t key[SHA256_CBLOCK]) {
- for(size_t i=0; i<SHA256_DIGEST_LENGTH; i++) {
- key[i] = key[i] ^ 0x36;
- }
- // set rest of the buffer to ipad = 0x36
- memset(&key[SHA256_DIGEST_LENGTH], 0x36, SHA256_CBLOCK - SHA256_DIGEST_LENGTH);
-
- SHA256_CTX ctx;
- SHA256_Init(&ctx);
- SHA256_Update(&ctx, key, SHA256_CBLOCK);
- SHA256_Update(&ctx, S, 2);
- uint8_t digest[SHA256_DIGEST_LENGTH];
- SHA256_Final(digest, &ctx);
-
- // XOR key with an opad = 0x5C
- for(size_t i=0; i<SHA256_CBLOCK; i++) {
- key[i] = key[i] ^ 0x36 ^ 0x5C;
- }
-
- SHA256_Init(&ctx);
- SHA256_Update(&ctx, key, SHA256_CBLOCK);
- SHA256_Update(&ctx, digest, SHA256_DIGEST_LENGTH);
- SHA256_Final(digest, &ctx);
- assert(outsz <= sizeof(digest));
- memcpy(out, digest, outsz);
-}
-
// Swap points.
// If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P
#if !defined(OPENSSL_X86_64) || defined(OPENSSL_NO_ASM)
@@ -104,7 +64,7 @@
#endif
}
-static void LADDER3PT(
+static void ladder3Pt(
const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const uint8_t* m,
int is_A, point_proj_t R, const f2elm_t A) {
point_proj_t R0 = POINT_PROJ_INIT, R2 = POINT_PROJ_INIT;
@@ -112,10 +72,10 @@
crypto_word_t mask;
int bit, swap, prevbit = 0;
- const size_t nbits = is_A?SIDHp503_PRV_A_BITSZ:SIDHp503_PRV_B_BITSZ;
+ const size_t nbits = is_A?SIDH_PRV_A_BITSZ:SIDH_PRV_B_BITSZ;
// Initializing constant
- sike_fpcopy(p503.mont_one, A24[0].c0);
+ sike_fpcopy(params.mont_one, A24[0].c0);
sike_fp2add(A24, A24, A24);
sike_fp2add(A, A24, A24);
sike_fp2div2(A24, A24);
@@ -123,11 +83,11 @@
// Initializing points
sike_fp2copy(xQ, R0->X);
- sike_fpcopy(p503.mont_one, R0->Z[0].c0);
+ sike_fpcopy(params.mont_one, R0->Z[0].c0);
sike_fp2copy(xPQ, R2->X);
- sike_fpcopy(p503.mont_one, R2->Z[0].c0);
+ sike_fpcopy(params.mont_one, R2->Z[0].c0);
sike_fp2copy(xP, R->X);
- sike_fpcopy(p503.mont_one, R->Z[0].c0);
+ sike_fpcopy(params.mont_one, R->Z[0].c0);
memset(R->Z->c1, 0, sizeof(R->Z->c1));
// Main loop
@@ -141,6 +101,9 @@
xDBLADD(R0, R2, R->X, A24);
sike_fp2mul_mont(R2->X, R->Z, R2->X);
}
+
+ mask = 0 - (crypto_word_t)prevbit;
+ sike_fp2cswap(R, R2, mask);
}
// Initialization of basis points
@@ -148,9 +111,9 @@
sike_fpcopy(gen, XP->c0);
sike_fpcopy(gen + NWORDS_FIELD, XP->c1);
sike_fpcopy(gen + 2*NWORDS_FIELD, XQ->c0);
- memset(XQ->c1, 0, sizeof(XQ->c1));
- sike_fpcopy(gen + 3*NWORDS_FIELD, XR->c0);
- sike_fpcopy(gen + 4*NWORDS_FIELD, XR->c1);
+ sike_fpcopy(gen + 3*NWORDS_FIELD, XQ->c1);
+ sike_fpcopy(gen + 4*NWORDS_FIELD, XR->c0);
+ sike_fpcopy(gen + 5*NWORDS_FIELD, XR->c1);
}
// Conversion of GF(p^2) element from Montgomery to standard representation.
@@ -195,18 +158,21 @@
unsigned int m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0;
// Initialize basis points
- sike_init_basis(p503.A_gen, XPA, XQA, XRA);
- sike_init_basis(p503.B_gen, phiP->X, phiQ->X, phiR->X);
- sike_fpcopy(p503.mont_one, (phiP->Z)->c0);
- sike_fpcopy(p503.mont_one, (phiQ->Z)->c0);
- sike_fpcopy(p503.mont_one, (phiR->Z)->c0);
+ sike_init_basis(params.A_gen, XPA, XQA, XRA);
+ sike_init_basis(params.B_gen, phiP->X, phiQ->X, phiR->X);
+ sike_fpcopy(params.mont_one, (phiP->Z)->c0);
+ sike_fpcopy(params.mont_one, (phiQ->Z)->c0);
+ sike_fpcopy(params.mont_one, (phiR->Z)->c0);
- // Initialize constants
- sike_fpcopy(p503.mont_one, A24plus->c0);
+ // Initialize constants: A24plus = A+2C, C24 = 4C, where A=6, C=1
+ sike_fpcopy(params.mont_one, A24plus->c0);
+ sike_fp2add(A24plus, A24plus, A24plus);
sike_fp2add(A24plus, A24plus, C24);
+ sike_fp2add(A24plus, C24, A);
+ sike_fp2add(C24, C24, A24plus);
// Retrieve kernel point
- LADDER3PT(XPA, XQA, XRA, skA, 1, R, A);
+ ladder3Pt(XPA, XQA, XRA, skA, 1, R, A);
// Traverse tree
index = 0;
@@ -215,7 +181,7 @@
sike_fp2copy(R->X, pts[npts]->X);
sike_fp2copy(R->Z, pts[npts]->Z);
pts_index[npts++] = index;
- m = p503.A_strat[ii++];
+ m = params.A_strat[ii++];
xDBLe(R, R, A24plus, C24, (2*m));
index += m;
}
@@ -246,8 +212,8 @@
// Format public key
sike_fp2_encode(phiP->X, pkA);
- sike_fp2_encode(phiQ->X, pkA + SIDHp503_JINV_BYTESZ);
- sike_fp2_encode(phiR->X, pkA + 2*SIDHp503_JINV_BYTESZ);
+ sike_fp2_encode(phiQ->X, pkA + SIDH_JINV_BYTESZ);
+ sike_fp2_encode(phiR->X, pkA + 2*SIDH_JINV_BYTESZ);
}
// Bob's ephemeral key-pair generation
@@ -267,20 +233,21 @@
unsigned int m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0;
// Initialize basis points
- sike_init_basis(p503.B_gen, XPB, XQB, XRB);
- sike_init_basis(p503.A_gen, phiP->X, phiQ->X, phiR->X);
- sike_fpcopy(p503.mont_one, (phiP->Z)->c0);
- sike_fpcopy(p503.mont_one, (phiQ->Z)->c0);
- sike_fpcopy(p503.mont_one, (phiR->Z)->c0);
+ sike_init_basis(params.B_gen, XPB, XQB, XRB);
+ sike_init_basis(params.A_gen, phiP->X, phiQ->X, phiR->X);
+ sike_fpcopy(params.mont_one, (phiP->Z)->c0);
+ sike_fpcopy(params.mont_one, (phiQ->Z)->c0);
+ sike_fpcopy(params.mont_one, (phiR->Z)->c0);
- // Initialize constants
- sike_fpcopy(p503.mont_one, A24plus->c0);
+ // Initialize constants: A24minus = A-2C, A24plus = A+2C, where A=6, C=1
+ sike_fpcopy(params.mont_one, A24plus->c0);
sike_fp2add(A24plus, A24plus, A24plus);
- sike_fp2copy(A24plus, A24minus);
- sike_fp2neg(A24minus);
+ sike_fp2add(A24plus, A24plus, A24minus);
+ sike_fp2add(A24plus, A24minus, A);
+ sike_fp2add(A24minus, A24minus, A24plus);
// Retrieve kernel point
- LADDER3PT(XPB, XQB, XRB, skB, 0, R, A);
+ ladder3Pt(XPB, XQB, XRB, skB, 0, R, A);
// Traverse tree
index = 0;
@@ -289,7 +256,7 @@
sike_fp2copy(R->X, pts[npts]->X);
sike_fp2copy(R->Z, pts[npts]->Z);
pts_index[npts++] = index;
- m = p503.B_strat[ii++];
+ m = params.B_strat[ii++];
xTPLe(R, R, A24minus, A24plus, m);
index += m;
}
@@ -320,8 +287,8 @@
// Format public key
sike_fp2_encode(phiP->X, pkB);
- sike_fp2_encode(phiQ->X, pkB + SIDHp503_JINV_BYTESZ);
- sike_fp2_encode(phiR->X, pkB + 2*SIDHp503_JINV_BYTESZ);
+ sike_fp2_encode(phiQ->X, pkB + SIDH_JINV_BYTESZ);
+ sike_fp2_encode(phiR->X, pkB + 2*SIDH_JINV_BYTESZ);
}
// Alice's ephemeral shared secret computation
@@ -340,17 +307,17 @@
// Initialize images of Bob's basis
fp2_decode(pkB, PKB[0]);
- fp2_decode(pkB + SIDHp503_JINV_BYTESZ, PKB[1]);
- fp2_decode(pkB + 2*SIDHp503_JINV_BYTESZ, PKB[2]);
+ fp2_decode(pkB + SIDH_JINV_BYTESZ, PKB[1]);
+ fp2_decode(pkB + 2*SIDH_JINV_BYTESZ, PKB[2]);
// Initialize constants
- get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A?
- sike_fpadd(p503.mont_one, p503.mont_one, C24->c0);
+ get_A(PKB[0], PKB[1], PKB[2], A);
+ sike_fpadd(params.mont_one, params.mont_one, C24->c0);
sike_fp2add(A, C24, A24plus);
sike_fpadd(C24->c0, C24->c0, C24->c0);
// Retrieve kernel point
- LADDER3PT(PKB[0], PKB[1], PKB[2], skA, 1, R, A);
+ ladder3Pt(PKB[0], PKB[1], PKB[2], skA, 1, R, A);
// Traverse tree
index = 0;
@@ -359,7 +326,7 @@
sike_fp2copy(R->X, pts[npts]->X);
sike_fp2copy(R->Z, pts[npts]->Z);
pts_index[npts++] = index;
- m = p503.A_strat[ii++];
+ m = params.A_strat[ii++];
xDBLe(R, R, A24plus, C24, (2*m));
index += m;
}
@@ -376,9 +343,9 @@
}
get_4_isog(R, A24plus, C24, coeff);
- sike_fp2div2(C24, C24);
+ sike_fp2add(A24plus, A24plus, A24plus);
sike_fp2sub(A24plus, C24, A24plus);
- sike_fp2div2(C24, C24);
+ sike_fp2add(A24plus, A24plus, A24plus);
j_inv(A24plus, C24, jinv);
sike_fp2_encode(jinv, ssA);
}
@@ -399,17 +366,17 @@
// Initialize images of Alice's basis
fp2_decode(pkA, PKB[0]);
- fp2_decode(pkA + SIDHp503_JINV_BYTESZ, PKB[1]);
- fp2_decode(pkA + 2*SIDHp503_JINV_BYTESZ, PKB[2]);
+ fp2_decode(pkA + SIDH_JINV_BYTESZ, PKB[1]);
+ fp2_decode(pkA + 2*SIDH_JINV_BYTESZ, PKB[2]);
// Initialize constants
get_A(PKB[0], PKB[1], PKB[2], A);
- sike_fpadd(p503.mont_one, p503.mont_one, A24minus->c0);
+ sike_fpadd(params.mont_one, params.mont_one, A24minus->c0);
sike_fp2add(A, A24minus, A24plus);
sike_fp2sub(A, A24minus, A24minus);
// Retrieve kernel point
- LADDER3PT(PKB[0], PKB[1], PKB[2], skB, 0, R, A);
+ ladder3Pt(PKB[0], PKB[1], PKB[2], skB, 0, R, A);
// Traverse tree
index = 0;
@@ -418,7 +385,7 @@
sike_fp2copy(R->X, pts[npts]->X);
sike_fp2copy(R->Z, pts[npts]->Z);
pts_index[npts++] = index;
- m = p503.B_strat[ii++];
+ m = params.B_strat[ii++];
xTPLe(R, R, A24minus, A24plus, m);
index += m;
}
@@ -442,17 +409,17 @@
sike_fp2_encode(jinv, ssB);
}
-int SIKE_keypair(uint8_t out_priv[SIKEp503_PRV_BYTESZ],
- uint8_t out_pub[SIKEp503_PUB_BYTESZ]) {
+int SIKE_keypair(uint8_t out_priv[SIKE_PRV_BYTESZ],
+ uint8_t out_pub[SIKE_PUB_BYTESZ]) {
int ret = 0;
// Calculate private key for Alice. Needs to be in range [0, 2^0xFA - 1] and <
// 253 bits
BIGNUM *bn_sidh_prv = BN_new();
if (!bn_sidh_prv ||
- !BN_rand(bn_sidh_prv, SIDHp503_PRV_B_BITSZ, BN_RAND_TOP_ONE,
+ !BN_rand(bn_sidh_prv, SIDH_PRV_B_BITSZ, BN_RAND_TOP_ONE,
BN_RAND_BOTTOM_ANY) ||
- !BN_bn2le_padded(out_priv, BITS_TO_BYTES(SIDHp503_PRV_B_BITSZ),
+ !BN_bn2le_padded(out_priv, BITS_TO_BYTES(SIDH_PRV_B_BITSZ),
bn_sidh_prv)) {
goto end;
}
@@ -465,70 +432,67 @@
return ret;
}
-void SIKE_encaps(uint8_t out_shared_key[SIKEp503_SS_BYTESZ],
- uint8_t out_ciphertext[SIKEp503_CT_BYTESZ],
- const uint8_t pub_key[SIKEp503_PUB_BYTESZ]) {
+void SIKE_encaps(uint8_t out_shared_key[SIKE_SS_BYTESZ],
+ uint8_t out_ciphertext[SIKE_CT_BYTESZ],
+ const uint8_t pub_key[SIKE_PUB_BYTESZ]) {
// Secret buffer is reused by the function to store some ephemeral
// secret data. It's size must be maximum of SHA256_CBLOCK,
- // SIKEp503_MSG_BYTESZ and SIDHp503_PRV_A_BITSZ in bytes.
+ // SIKE_MSG_BYTESZ and SIDH_PRV_A_BITSZ in bytes.
uint8_t secret[SHA256_CBLOCK];
- uint8_t j[SIDHp503_JINV_BYTESZ];
- uint8_t temp[SIKEp503_MSG_BYTESZ + SIKEp503_CT_BYTESZ];
+ uint8_t j[SIDH_JINV_BYTESZ];
+ uint8_t temp[SIKE_MSG_BYTESZ + SIKE_CT_BYTESZ];
SHA256_CTX ctx;
// Generate secret key for A
- // secret key A = HMAC({0,1}^n || pub_key), G) mod SIDHp503_PRV_A_BITSZ
- RAND_bytes(temp, SIKEp503_MSG_BYTESZ);
+ // secret key A = SHA256({0,1}^n || pub_key) mod SIDH_PRV_A_BITSZ
+ RAND_bytes(temp, SIKE_MSG_BYTESZ);
SHA256_Init(&ctx);
- SHA256_Update(&ctx, temp, SIKEp503_MSG_BYTESZ);
- SHA256_Update(&ctx, pub_key, SIKEp503_PUB_BYTESZ);
+ SHA256_Update(&ctx, temp, SIKE_MSG_BYTESZ);
+ SHA256_Update(&ctx, pub_key, SIKE_PUB_BYTESZ);
SHA256_Final(secret, &ctx);
- hmac_sum(secret, BITS_TO_BYTES(SIDHp503_PRV_A_BITSZ), G, secret);
- secret[BITS_TO_BYTES(SIDHp503_PRV_A_BITSZ) - 1] &=
- (1 << (SIDHp503_PRV_A_BITSZ % 8)) - 1;
// Generate public key for A - first part of the ciphertext
gen_iso_A(secret, out_ciphertext);
// Generate c1:
- // h = HMAC(j-invariant(secret key A, public key B), F)
+ // h = SHA256(j-invariant)
// c1 = h ^ m
ex_iso_A(secret, pub_key, j);
SHA256_Init(&ctx);
SHA256_Update(&ctx, j, sizeof(j));
SHA256_Final(secret, &ctx);
- hmac_sum(secret, SIKEp503_MSG_BYTESZ, F, secret);
// c1 = h ^ m
- uint8_t *c1 = &out_ciphertext[SIKEp503_PUB_BYTESZ];
- for (size_t i = 0; i < SIKEp503_MSG_BYTESZ; i++) {
+ uint8_t *c1 = &out_ciphertext[SIKE_PUB_BYTESZ];
+ for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) {
c1[i] = temp[i] ^ secret[i];
}
SHA256_Init(&ctx);
- SHA256_Update(&ctx, temp, SIKEp503_MSG_BYTESZ);
- SHA256_Update(&ctx, out_ciphertext, SIKEp503_CT_BYTESZ);
+ SHA256_Update(&ctx, temp, SIKE_MSG_BYTESZ);
+ SHA256_Update(&ctx, out_ciphertext, SIKE_CT_BYTESZ);
SHA256_Final(secret, &ctx);
- // Generate shared secret out_shared_key = HMAC(m||out_ciphertext, F)
- hmac_sum(out_shared_key, SIKEp503_SS_BYTESZ, H, secret);
+ // Generate shared secret out_shared_key = SHA256(m||out_ciphertext)
+ memcpy(out_shared_key, secret, SIKE_SS_BYTESZ);
}
-void SIKE_decaps(uint8_t out_shared_key[SIKEp503_SS_BYTESZ],
- const uint8_t ciphertext[SIKEp503_CT_BYTESZ],
- const uint8_t pub_key[SIKEp503_PUB_BYTESZ],
- const uint8_t priv_key[SIKEp503_PRV_BYTESZ]) {
+void SIKE_decaps(uint8_t out_shared_key[SIKE_SS_BYTESZ],
+ const uint8_t ciphertext[SIKE_CT_BYTESZ],
+ const uint8_t pub_key[SIKE_PUB_BYTESZ],
+ const uint8_t priv_key[SIKE_PRV_BYTESZ]) {
// Secret buffer is reused by the function to store some ephemeral
// secret data. It's size must be maximum of SHA256_CBLOCK,
- // SIKEp503_MSG_BYTESZ and SIDHp503_PRV_A_BITSZ in bytes.
+ // SIKE_MSG_BYTESZ and SIDH_PRV_A_BITSZ in bytes.
uint8_t secret[SHA256_CBLOCK];
- uint8_t j[SIDHp503_JINV_BYTESZ];
- uint8_t c0[SIKEp503_PUB_BYTESZ];
- uint8_t temp[SIKEp503_MSG_BYTESZ];
- uint8_t shared_nok[SIKEp503_MSG_BYTESZ];
+ uint8_t j[SIDH_JINV_BYTESZ];
+ uint8_t c0[SIKE_PUB_BYTESZ];
+ uint8_t temp[SIKE_MSG_BYTESZ];
+ uint8_t shared_nok[SIKE_MSG_BYTESZ];
SHA256_CTX ctx;
- RAND_bytes(shared_nok, SIKEp503_MSG_BYTESZ);
+ // This is OK as we are only using ephemeral keys in BoringSSL
+ RAND_bytes(shared_nok, SIKE_MSG_BYTESZ);
// Recover m
// Let ciphertext = c0 || c1 - both have fixed sizes
@@ -538,34 +502,30 @@
SHA256_Init(&ctx);
SHA256_Update(&ctx, j, sizeof(j));
SHA256_Final(secret, &ctx);
- hmac_sum(secret, SIKEp503_MSG_BYTESZ, F, secret);
const uint8_t *c1 = &ciphertext[sizeof(c0)];
- for (size_t i = 0; i < SIKEp503_MSG_BYTESZ; i++) {
+ for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) {
temp[i] = c1[i] ^ secret[i];
}
SHA256_Init(&ctx);
- SHA256_Update(&ctx, temp, SIKEp503_MSG_BYTESZ);
- SHA256_Update(&ctx, pub_key, SIKEp503_PUB_BYTESZ);
+ SHA256_Update(&ctx, temp, SIKE_MSG_BYTESZ);
+ SHA256_Update(&ctx, pub_key, SIKE_PUB_BYTESZ);
SHA256_Final(secret, &ctx);
- hmac_sum(secret, BITS_TO_BYTES(SIDHp503_PRV_A_BITSZ), G, secret);
-
- // Recover secret key A = G(m||pub_key) mod
- secret[BITS_TO_BYTES(SIDHp503_PRV_A_BITSZ) - 1] &=
- (1 << (SIDHp503_PRV_A_BITSZ % 8)) - 1;
// Recover c0 = public key A
gen_iso_A(secret, c0);
crypto_word_t ok = constant_time_is_zero_w(
- CRYPTO_memcmp(c0, ciphertext, SIKEp503_PUB_BYTESZ));
- for (size_t i = 0; i < SIKEp503_MSG_BYTESZ; i++) {
+ CRYPTO_memcmp(c0, ciphertext, SIKE_PUB_BYTESZ));
+ for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) {
temp[i] = constant_time_select_8(ok, temp[i], shared_nok[i]);
}
SHA256_Init(&ctx);
- SHA256_Update(&ctx, temp, SIKEp503_MSG_BYTESZ);
- SHA256_Update(&ctx, ciphertext, SIKEp503_CT_BYTESZ);
+ SHA256_Update(&ctx, temp, SIKE_MSG_BYTESZ);
+ SHA256_Update(&ctx, ciphertext, SIKE_CT_BYTESZ);
SHA256_Final(secret, &ctx);
- hmac_sum(out_shared_key, SIKEp503_SS_BYTESZ, H, secret);
+
+ // Generate shared secret out_shared_key = SHA256(m||ciphertext)
+ memcpy(out_shared_key, secret, SIKE_SS_BYTESZ);
}
diff --git a/third_party/sike/sike.h b/third_party/sike/sike.h
index 09093cd..5819ebf 100644
--- a/third_party/sike/sike.h
+++ b/third_party/sike/sike.h
@@ -10,14 +10,14 @@
#include <stdint.h>
#include <openssl/base.h>
-#ifdef __cplusplus
+#if defined(__cplusplus)
extern "C" {
#endif
-/* SIKEp503
+/* SIKE
*
* SIKE is a isogeny based post-quantum key encapsulation mechanism. Description of the
- * algorithm is provided in [SIKE]. This implementation uses 503-bit field size. The code
+ * algorithm is provided in [SIKE]. This implementation uses 434-bit field size. The code
* is based on "Additional_Implementations" from PQC NIST submission package which can
* be found here:
* https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/SIKE.zip
@@ -25,39 +25,39 @@
* [SIKE] https://sike.org/files/SIDH-spec.pdf
*/
-// SIKEp503_PUB_BYTESZ is the number of bytes in a public key.
-#define SIKEp503_PUB_BYTESZ 378
-// SIKEp503_PRV_BYTESZ is the number of bytes in a private key.
-#define SIKEp503_PRV_BYTESZ 32
-// SIKEp503_SS_BYTESZ is the number of bytes in a shared key.
-#define SIKEp503_SS_BYTESZ 16
-// SIKEp503_MSG_BYTESZ is the number of bytes in a random bit string concatenated
+// SIKE_PUB_BYTESZ is the number of bytes in a public key.
+#define SIKE_PUB_BYTESZ 330
+// SIKE_PRV_BYTESZ is the number of bytes in a private key.
+#define SIKE_PRV_BYTESZ 28
+// SIKE_SS_BYTESZ is the number of bytes in a shared key.
+#define SIKE_SS_BYTESZ 16
+// SIKE_MSG_BYTESZ is the number of bytes in a random bit string concatenated
// with the public key (see 1.4 of SIKE).
-#define SIKEp503_MSG_BYTESZ 24
-// SIKEp503_SS_BYTESZ is the number of bytes in a ciphertext.
-#define SIKEp503_CT_BYTESZ (SIKEp503_PUB_BYTESZ + SIKEp503_MSG_BYTESZ)
+#define SIKE_MSG_BYTESZ 16
+// SIKE_CT_BYTESZ is the number of bytes in a ciphertext.
+#define SIKE_CT_BYTESZ (SIKE_PUB_BYTESZ + SIKE_MSG_BYTESZ)
// SIKE_keypair outputs a public and secret key. Internally it uses BN_rand() as
// an entropy source. In case of success function returns 1, otherwise 0.
OPENSSL_EXPORT int SIKE_keypair(
- uint8_t out_priv[SIKEp503_PRV_BYTESZ],
- uint8_t out_pub[SIKEp503_PUB_BYTESZ]);
+ uint8_t out_priv[SIKE_PRV_BYTESZ],
+ uint8_t out_pub[SIKE_PUB_BYTESZ]);
// SIKE_encaps generates and encrypts a random session key, writing those values to
// |out_shared_key| and |out_ciphertext|, respectively.
OPENSSL_EXPORT void SIKE_encaps(
- uint8_t out_shared_key[SIKEp503_SS_BYTESZ],
- uint8_t out_ciphertext[SIKEp503_CT_BYTESZ],
- const uint8_t pub_key[SIKEp503_PUB_BYTESZ]);
+ uint8_t out_shared_key[SIKE_SS_BYTESZ],
+ uint8_t out_ciphertext[SIKE_CT_BYTESZ],
+ const uint8_t pub_key[SIKE_PUB_BYTESZ]);
// SIKE_decaps outputs a random session key, writing it to |out_shared_key|.
OPENSSL_EXPORT void SIKE_decaps(
- uint8_t out_shared_key[SIKEp503_SS_BYTESZ],
- const uint8_t ciphertext[SIKEp503_CT_BYTESZ],
- const uint8_t pub_key[SIKEp503_PUB_BYTESZ],
- const uint8_t priv_key[SIKEp503_PRV_BYTESZ]);
+ uint8_t out_shared_key[SIKE_SS_BYTESZ],
+ const uint8_t ciphertext[SIKE_CT_BYTESZ],
+ const uint8_t pub_key[SIKE_PUB_BYTESZ],
+ const uint8_t priv_key[SIKE_PRV_BYTESZ]);
-#ifdef __cplusplus
+#if defined(__cplusplus)
}
#endif
diff --git a/third_party/sike/sike_test.cc b/third_party/sike/sike_test.cc
index 1277e09..2180a52 100644
--- a/third_party/sike/sike_test.cc
+++ b/third_party/sike/sike_test.cc
@@ -12,202 +12,189 @@
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#include <gtest/gtest.h>
#include <stdint.h>
-#include <gtest/gtest.h>
-
-#include "../../crypto/test/abi_test.h"
#include "sike.h"
#include "fpx.h"
+#include "../../crypto/test/abi_test.h"
TEST(SIKE, RoundTrip) {
- uint8_t sk[SIKEp503_PRV_BYTESZ] = {0};
- uint8_t pk[SIKEp503_PUB_BYTESZ] = {0};
- uint8_t ct[SIKEp503_CT_BYTESZ] = {0};
- uint8_t ss_enc[SIKEp503_SS_BYTESZ] = {0};
- uint8_t ss_dec[SIKEp503_SS_BYTESZ] = {0};
+ uint8_t sk[SIKE_PRV_BYTESZ] = {0};
+ uint8_t pk[SIKE_PUB_BYTESZ] = {0};
+ uint8_t ct[SIKE_CT_BYTESZ] = {0};
+ uint8_t ss_enc[SIKE_SS_BYTESZ] = {0};
+ uint8_t ss_dec[SIKE_SS_BYTESZ] = {0};
+ for (size_t i = 0; i < 30; i++) {
EXPECT_EQ(SIKE_keypair(sk, pk), 1);
SIKE_encaps(ss_enc, ct, pk);
SIKE_decaps(ss_dec, ct, pk, sk);
- EXPECT_EQ(memcmp(ss_enc, ss_dec, SIKEp503_SS_BYTESZ), 0);
+ EXPECT_EQ(memcmp(ss_enc, ss_dec, SIKE_SS_BYTESZ), 0);
+ }
}
TEST(SIKE, Decapsulation) {
- const uint8_t sk[SIKEp503_PRV_BYTESZ] = {
- 0xDB, 0xAF, 0x2C, 0x89, 0xCA, 0x5A, 0xD4, 0x9D, 0x4F, 0x13,
- 0x40, 0xDF, 0x2D, 0xB1, 0x5F, 0x4C, 0x91, 0xA7, 0x1F, 0x0B,
- 0x29, 0x15, 0x01, 0x59, 0xBC, 0x5F, 0x0B, 0x4A, 0x03, 0x27,
- 0x6F, 0x18};
+ const uint8_t sk[SIKE_PRV_BYTESZ] = {
+ 0xB1, 0xFD, 0x34, 0x42, 0xDB, 0x02, 0xBC, 0x9D, 0x4C, 0xD0,
+ 0x72, 0x34, 0x4D, 0xBD, 0x06, 0xDF, 0x1C, 0x7D, 0x0A, 0x88,
+ 0xB2, 0x50, 0xC4, 0xF6, 0xAE, 0xE8, 0x25, 0x01};
- const uint8_t pk[SIKEp503_PUB_BYTESZ] = {
- 0x07, 0xAA, 0x51, 0x45, 0x3E, 0x1F, 0x53, 0x2A, 0x0A, 0x05,
- 0x46, 0xF6, 0x54, 0x7F, 0x5D, 0x56, 0xD6, 0x76, 0xD3, 0xEA,
- 0x4B, 0x6B, 0x01, 0x9B, 0x11, 0x72, 0x6F, 0x75, 0xEA, 0x34,
- 0x3C, 0x28, 0x2C, 0x36, 0xFD, 0x77, 0xDA, 0xBE, 0xB6, 0x20,
- 0x18, 0xC1, 0x93, 0x98, 0x18, 0x86, 0x30, 0x2F, 0x2E, 0xD2,
- 0x00, 0x61, 0xFF, 0xAE, 0x78, 0xAE, 0xFB, 0x6F, 0x32, 0xAC,
- 0x06, 0xBF, 0x35, 0xF6, 0xF7, 0x5B, 0x98, 0x26, 0x95, 0xC2,
- 0xD8, 0xD6, 0x1C, 0x0E, 0x47, 0xDA, 0x76, 0xCE, 0xB5, 0xF1,
- 0x19, 0xCC, 0x01, 0xE1, 0x17, 0xA9, 0x62, 0xF7, 0x82, 0x6C,
- 0x25, 0x51, 0x25, 0xAE, 0xFE, 0xE3, 0xE2, 0xE1, 0x35, 0xAE,
- 0x2E, 0x8F, 0x38, 0xE0, 0x7C, 0x74, 0x3C, 0x1D, 0x39, 0x91,
- 0x1B, 0xC7, 0x9F, 0x8E, 0x33, 0x4E, 0x84, 0x19, 0xB8, 0xD9,
- 0xC2, 0x71, 0x35, 0x02, 0x47, 0x3E, 0x79, 0xEF, 0x47, 0xE1,
- 0xD8, 0x21, 0x96, 0x1F, 0x11, 0x59, 0x39, 0x34, 0x76, 0xEF,
- 0x3E, 0xB7, 0x4E, 0xFB, 0x7C, 0x55, 0xA1, 0x85, 0xAA, 0xAB,
- 0xAD, 0xF0, 0x09, 0xCB, 0xD1, 0xE3, 0x7C, 0x4F, 0x5D, 0x2D,
- 0xE1, 0x13, 0xF0, 0x71, 0xD9, 0xE5, 0xF6, 0xAF, 0x7F, 0xC1,
- 0x27, 0x95, 0x8D, 0x52, 0xD5, 0x96, 0x42, 0x38, 0x41, 0xF7,
- 0x24, 0x3F, 0x3A, 0xB5, 0x7E, 0x11, 0xE4, 0xF9, 0x33, 0xEE,
- 0x4D, 0xBE, 0x74, 0x48, 0xF9, 0x98, 0x04, 0x01, 0x16, 0xEB,
- 0xA9, 0x0D, 0x61, 0xC6, 0xFD, 0x4C, 0xCF, 0x98, 0x84, 0x4A,
- 0x94, 0xAC, 0x69, 0x2C, 0x02, 0x8B, 0xE3, 0xD1, 0x41, 0x0D,
- 0xF2, 0x2D, 0x46, 0x1F, 0x57, 0x1C, 0x77, 0x86, 0x18, 0xE3,
- 0x63, 0xDE, 0xF3, 0xE3, 0x02, 0x30, 0x54, 0x73, 0xAE, 0xC2,
- 0x32, 0xA2, 0xCE, 0xEB, 0xCF, 0x81, 0x46, 0x54, 0x5C, 0xF4,
- 0x5D, 0x2A, 0x03, 0x5D, 0x9C, 0xAE, 0xE0, 0x60, 0x03, 0x80,
- 0x11, 0x30, 0xA5, 0xAA, 0xD1, 0x75, 0x67, 0xE0, 0x1C, 0x2B,
- 0x6B, 0x5D, 0x83, 0xDE, 0x92, 0x9B, 0x0E, 0xD7, 0x11, 0x0F,
- 0x00, 0xC4, 0x59, 0xE4, 0x81, 0x04, 0x3B, 0xEE, 0x5C, 0x04,
- 0xD1, 0x0E, 0xD0, 0x67, 0xF5, 0xCC, 0xAA, 0x72, 0x73, 0xEA,
- 0xC4, 0x76, 0x99, 0x3B, 0x4C, 0x90, 0x2F, 0xCB, 0xD8, 0x0A,
- 0x5B, 0xEC, 0x0E, 0x0E, 0x1F, 0x59, 0xEA, 0x14, 0x8D, 0x34,
- 0x53, 0x65, 0x4C, 0x1A, 0x59, 0xA8, 0x95, 0x66, 0x60, 0xBB,
- 0xC4, 0xCC, 0x32, 0xA9, 0x8D, 0x2A, 0xAA, 0x14, 0x6F, 0x0F,
- 0x81, 0x4D, 0x32, 0x02, 0xFD, 0x33, 0x58, 0x42, 0xCF, 0xF3,
- 0x67, 0xD0, 0x9F, 0x0B, 0xB1, 0xCC, 0x18, 0xA5, 0xC4, 0x19,
- 0xB6, 0x00, 0xED, 0xFA, 0x32, 0x1A, 0x5F, 0x67, 0xC8, 0xC3,
- 0xEB, 0x0D, 0xB5, 0x9A, 0x36, 0x47, 0x82, 0x00};
+ const uint8_t pk[SIKE_PUB_BYTESZ] = {
+ 0x6D, 0x8D, 0xF5, 0x7B, 0xCD, 0x47, 0xCA, 0xCB, 0x7A, 0x38, 0xB7, 0xA6,
+ 0x90, 0xB7, 0x37, 0x03, 0xD4, 0x6F, 0x27, 0x73, 0x74, 0x17, 0x5A, 0xA4,
+ 0x0D, 0xC6, 0x81, 0xAD, 0xDB, 0xF7, 0x18, 0xB2, 0x3C, 0x30, 0xCF, 0xAA,
+ 0x08, 0x11, 0x91, 0xCC, 0x27, 0x4E, 0xF1, 0xA6, 0xB7, 0xDA, 0xD2, 0xCF,
+ 0x99, 0x7F, 0xF7, 0xE1, 0xD0, 0xCE, 0x00, 0xD2, 0x4B, 0xA4, 0x33, 0xB4,
+ 0x87, 0x01, 0x3F, 0x02, 0xF7, 0xF9, 0xDE, 0xC3, 0x60, 0x62, 0xDA, 0x3F,
+ 0x74, 0xA9, 0x44, 0xBE, 0x19, 0xD5, 0x03, 0x2A, 0x79, 0x8C, 0xA7, 0xFF,
+ 0xEA, 0xB3, 0xBB, 0xB5, 0xD4, 0x1D, 0x8F, 0x92, 0xCE, 0x62, 0x6E, 0x99,
+ 0x24, 0xD7, 0x57, 0xFA, 0xCD, 0xB6, 0xE2, 0x8E, 0xFD, 0x22, 0x0E, 0x31,
+ 0x21, 0x01, 0x8D, 0x79, 0xF8, 0x3E, 0x27, 0xEC, 0x43, 0x40, 0xDB, 0x82,
+ 0xE5, 0xEB, 0x6C, 0x97, 0x66, 0x29, 0x15, 0x68, 0xB7, 0x4D, 0x84, 0xD1,
+ 0x8A, 0x0B, 0x12, 0x36, 0x2C, 0x0C, 0x0A, 0x6E, 0x4E, 0xDE, 0xA5, 0x8A,
+ 0xDE, 0x77, 0xDD, 0x70, 0x49, 0x73, 0xAC, 0x27, 0x6D, 0x8D, 0x25, 0x9A,
+ 0xE4, 0x25, 0xE8, 0x95, 0x8F, 0xFE, 0x90, 0x3B, 0x00, 0x69, 0x20, 0xE8,
+ 0x7C, 0xA5, 0xF5, 0x79, 0xC0, 0x61, 0x51, 0x91, 0x35, 0x25, 0x3F, 0x17,
+ 0x2F, 0x70, 0x73, 0xF0, 0x89, 0xB5, 0xC8, 0x25, 0xB8, 0xE5, 0x7E, 0x34,
+ 0xDD, 0x11, 0xE5, 0xD6, 0xC3, 0xD5, 0x29, 0x89, 0xC6, 0x2C, 0x99, 0x53,
+ 0x1D, 0x2C, 0x77, 0xB0, 0xB6, 0xA1, 0xBD, 0x79, 0xFB, 0x4A, 0xC2, 0x48,
+ 0x4C, 0x62, 0x51, 0x00, 0xE3, 0x91, 0x2A, 0xCB, 0x84, 0x03, 0x5D, 0x2D,
+ 0xC8, 0x33, 0xE9, 0x14, 0xBF, 0x74, 0x21, 0xBC, 0xF4, 0x76, 0xE5, 0x42,
+ 0xB8, 0xBD, 0xE2, 0xE7, 0x20, 0x95, 0x54, 0xF2, 0xED, 0xC0, 0x79, 0x38,
+ 0x1E, 0xD2, 0xEA, 0x1A, 0x63, 0x85, 0xE7, 0x3A, 0xDA, 0xAD, 0xAB, 0x1B,
+ 0x1E, 0x19, 0x9E, 0x73, 0xD0, 0x10, 0x2E, 0x38, 0xAC, 0x8B, 0x00, 0x6A,
+ 0x30, 0x2C, 0x3D, 0x70, 0x8E, 0x39, 0x6D, 0xC0, 0x12, 0x61, 0x7D, 0x2A,
+ 0x0A, 0x04, 0x95, 0x8E, 0x09, 0x3C, 0x7B, 0xEC, 0x2E, 0xBC, 0xE8, 0xE8,
+ 0xE8, 0x37, 0x29, 0xC4, 0x7E, 0x76, 0x48, 0xB9, 0x3B, 0x72, 0xE5, 0x99,
+ 0x9B, 0xF9, 0xE3, 0x99, 0x72, 0x3F, 0x35, 0x29, 0x85, 0xE0, 0xC8, 0xBF,
+ 0xB1, 0x6B, 0xB1, 0x6E, 0x72, 0x00};
- const uint8_t ct_exp[SIKEp503_CT_BYTESZ] = {
- 0xE6, 0xB7, 0xE5, 0x7B, 0xA9, 0x19, 0xD1, 0x2C, 0xB8, 0x5C,
- 0x7B, 0x66, 0x74, 0xB0, 0x71, 0xA1, 0xFF, 0x71, 0x7F, 0x4B,
- 0xB5, 0xA6, 0xAF, 0x48, 0x32, 0x52, 0xD5, 0x82, 0xEE, 0x8A,
- 0xBB, 0x08, 0x1E, 0xF6, 0xAC, 0x91, 0xA2, 0xCB, 0x6B, 0x6A,
- 0x09, 0x2B, 0xD9, 0xC6, 0x27, 0xD6, 0x3A, 0x6B, 0x8D, 0xFC,
- 0xB8, 0x90, 0x8F, 0x72, 0xB3, 0xFA, 0x7D, 0x34, 0x7A, 0xC4,
- 0x7E, 0xE3, 0x30, 0xC5, 0xA0, 0xFE, 0x3D, 0x43, 0x14, 0x4E,
- 0x3A, 0x14, 0x76, 0x3E, 0xFB, 0xDF, 0xE3, 0xA8, 0xE3, 0x5E,
- 0x38, 0xF2, 0xE0, 0x39, 0x67, 0x60, 0xFD, 0xFB, 0xB4, 0x19,
- 0xCD, 0xE1, 0x93, 0xA2, 0x06, 0xCC, 0x65, 0xCD, 0x6E, 0xC8,
- 0xB4, 0x5E, 0x41, 0x4B, 0x6C, 0xA5, 0xF4, 0xE4, 0x9D, 0x52,
- 0x8C, 0x25, 0x60, 0xDD, 0x3D, 0xA9, 0x7F, 0xF2, 0x88, 0xC1,
- 0x0C, 0xEE, 0x97, 0xE0, 0xE7, 0x3B, 0xB7, 0xD3, 0x6F, 0x28,
- 0x79, 0x2F, 0x50, 0xB2, 0x4F, 0x74, 0x3A, 0x0C, 0x88, 0x27,
- 0x98, 0x3A, 0x27, 0xD3, 0x26, 0x83, 0x59, 0x49, 0x81, 0x5B,
- 0x0D, 0xA7, 0x0C, 0x4F, 0xEF, 0xFB, 0x1E, 0xAF, 0xE9, 0xD2,
- 0x1C, 0x10, 0x25, 0xEC, 0x9E, 0xFA, 0x57, 0x36, 0xAA, 0x3F,
- 0xC1, 0xA3, 0x2C, 0xE9, 0xB5, 0xC9, 0xED, 0x72, 0x51, 0x4C,
- 0x02, 0xB4, 0x7B, 0xB3, 0xED, 0x9F, 0x45, 0x03, 0x34, 0xAC,
- 0x9A, 0x9E, 0x62, 0x5F, 0x82, 0x7A, 0x77, 0x34, 0xF9, 0x21,
- 0x94, 0xD2, 0x38, 0x3D, 0x05, 0xF0, 0x8A, 0x60, 0x1C, 0xB7,
- 0x1D, 0xF5, 0xB7, 0x53, 0x77, 0xD3, 0x9D, 0x3D, 0x70, 0x6A,
- 0xCB, 0x18, 0x20, 0x6B, 0x29, 0x17, 0x3A, 0x6D, 0xA1, 0xB2,
- 0x64, 0xDB, 0x6C, 0xE6, 0x1A, 0x95, 0xA7, 0xF4, 0x1A, 0x78,
- 0x1D, 0xA2, 0x40, 0x15, 0x41, 0x59, 0xDD, 0xEE, 0x23, 0x57,
- 0xCE, 0x36, 0x0D, 0x55, 0xBD, 0xB8, 0xFD, 0x0F, 0x35, 0xBD,
- 0x5B, 0x92, 0xD6, 0x1C, 0x84, 0x8C, 0x32, 0x64, 0xA6, 0x5C,
- 0x45, 0x18, 0x07, 0x6B, 0xF9, 0xA9, 0x43, 0x9A, 0x83, 0xCD,
- 0xB5, 0xB3, 0xD9, 0x17, 0x99, 0x2C, 0x2A, 0x8B, 0xE0, 0x8E,
- 0xAF, 0xA6, 0x4C, 0x95, 0xBB, 0x70, 0x60, 0x1A, 0x3A, 0x97,
- 0xAA, 0x2F, 0x3D, 0x22, 0x83, 0xB7, 0x4F, 0x59, 0xED, 0x3F,
- 0x4E, 0xF4, 0x19, 0xC6, 0x25, 0x0B, 0x0A, 0x5E, 0x21, 0xB9,
- 0x91, 0xB8, 0x19, 0x84, 0x48, 0x78, 0xCE, 0x27, 0xBF, 0x41,
- 0x89, 0xF6, 0x30, 0xFD, 0x6B, 0xD9, 0xB8, 0x1D, 0x72, 0x8A,
- 0x56, 0xCC, 0x2F, 0x82, 0xE4, 0x46, 0x4D, 0x75, 0xD8, 0x92,
- 0xE6, 0x9C, 0xCC, 0xD2, 0xCD, 0x35, 0xE4, 0xFC, 0x2A, 0x85,
- 0x6B, 0xA9, 0xB2, 0x27, 0xC9, 0xA1, 0xFF, 0xB3, 0x96, 0x3E,
- 0x59, 0xF6, 0x4C, 0x66, 0x56, 0x2E, 0xF5, 0x1B, 0x97, 0x32,
- 0xB0, 0x71, 0x5A, 0x9C, 0x50, 0x4B, 0x6F, 0xC4, 0xCA, 0x94,
- 0x75, 0x37, 0x46, 0x10, 0x12, 0x2F, 0x4F, 0xA3, 0x82, 0xCD,
- 0xBD, 0x7C};
+ const uint8_t ct[SIKE_CT_BYTESZ] = {
+ 0xFF, 0xEB, 0xEF, 0x4A, 0xC0, 0x57, 0x0F, 0x26, 0xAC, 0x76, 0xA8, 0xB0,
+ 0xA3, 0x5D, 0x9C, 0xD9, 0x25, 0xD1, 0x7F, 0x92, 0x5D, 0xF4, 0x23, 0x34,
+ 0xC3, 0x03, 0x10, 0xE1, 0xB0, 0x24, 0x9B, 0x44, 0x58, 0x26, 0x13, 0x56,
+ 0x83, 0x43, 0x72, 0x69, 0x28, 0x0D, 0x55, 0x07, 0x1F, 0xDB, 0xC0, 0x23,
+ 0x34, 0x83, 0x1A, 0x09, 0x9B, 0x80, 0x00, 0x64, 0x56, 0xDC, 0x79, 0x7A,
+ 0xD2, 0xCE, 0x23, 0xC9, 0x72, 0x27, 0xFC, 0x8D, 0xAB, 0xBF, 0xD3, 0x17,
+ 0xF6, 0x91, 0x7B, 0x15, 0x93, 0x83, 0x8A, 0x4F, 0x6C, 0xCA, 0x4A, 0x94,
+ 0xDA, 0xC7, 0x9D, 0xB6, 0xD6, 0xBA, 0xBD, 0x81, 0x9A, 0x78, 0xE5, 0xE5,
+ 0xBE, 0x17, 0xBC, 0xCB, 0xC8, 0x23, 0x80, 0x5F, 0x75, 0xF8, 0xDB, 0x51,
+ 0x55, 0x00, 0x25, 0x33, 0x52, 0x64, 0xB2, 0xD6, 0xD8, 0x9A, 0x2A, 0x9E,
+ 0x29, 0x99, 0x13, 0x33, 0xE2, 0xA7, 0x98, 0xAC, 0xD7, 0x79, 0x5C, 0x2F,
+ 0xBA, 0x07, 0xC3, 0x03, 0x37, 0xD6, 0xE6, 0xB5, 0xA1, 0xF5, 0x29, 0xB6,
+ 0xF6, 0xC0, 0x5C, 0x44, 0x68, 0x2B, 0x0B, 0xF5, 0x00, 0x01, 0x44, 0xD5,
+ 0xCC, 0x23, 0xB5, 0x27, 0x4F, 0xCA, 0xB4, 0x05, 0x01, 0xF9, 0xD4, 0x41,
+ 0xE0, 0xE1, 0x1E, 0xCF, 0xA9, 0xBC, 0x79, 0xD7, 0xD5, 0xF5, 0x3C, 0xE6,
+ 0x93, 0xF4, 0x6C, 0x84, 0x5A, 0x2C, 0x4B, 0xE4, 0x91, 0xB2, 0xB2, 0xB8,
+ 0xAD, 0x74, 0x9A, 0x69, 0x79, 0x4C, 0x84, 0xB7, 0xBF, 0xF1, 0x68, 0x4B,
+ 0xAE, 0x0F, 0x7F, 0x45, 0x3B, 0x18, 0x3F, 0xFA, 0x00, 0x48, 0xE0, 0x3A,
+ 0xE2, 0xC0, 0xAE, 0x00, 0xCE, 0x90, 0x28, 0xA4, 0x1B, 0xBE, 0xCA, 0x0C,
+ 0x21, 0x29, 0x64, 0x30, 0x5E, 0x35, 0xAD, 0xFD, 0x83, 0x47, 0x40, 0x6D,
+ 0x15, 0x56, 0xFC, 0xF8, 0x5F, 0xAB, 0x81, 0xFE, 0x6B, 0xE9, 0x6B, 0xED,
+ 0x27, 0x35, 0x7C, 0xD8, 0x2C, 0xD4, 0xF2, 0x11, 0xE6, 0xAF, 0xDF, 0xB8,
+ 0x91, 0x96, 0xEB, 0xF7, 0x4C, 0x8D, 0x70, 0x77, 0x90, 0x81, 0x00, 0x09,
+ 0x19, 0x27, 0x8A, 0x9E, 0xB6, 0x1A, 0xE9, 0xAC, 0x6C, 0xC9, 0xF8, 0xEA,
+ 0xA2, 0x34, 0xB8, 0xAC, 0xB3, 0xB3, 0x68, 0xA1, 0xB7, 0x29, 0x55, 0xCA,
+ 0x40, 0x23, 0x92, 0x5C, 0x0C, 0x79, 0x6B, 0xD6, 0x9F, 0x5B, 0xD2, 0xE6,
+ 0xAE, 0x04, 0xCB, 0xEC, 0xC7, 0x88, 0x18, 0xDB, 0x7A, 0xE6, 0xD6, 0xC9,
+ 0x39, 0xFD, 0x93, 0x9B, 0xC8, 0x01, 0x6F, 0x3E, 0x6C, 0x90, 0x3E, 0x73,
+ 0x76, 0x99, 0x7C, 0x48, 0xDA, 0x68, 0x48, 0x80, 0x2B, 0x63};
- const uint8_t ss_exp[SIKEp503_SS_BYTESZ] = {
- 0x74, 0x3D, 0x25, 0x36, 0x00, 0x24, 0x63, 0x1A, 0x39, 0x1A,
- 0xB4, 0xAD, 0x01, 0x17, 0x78, 0xE9};
+ const uint8_t ss_exp[SIKE_SS_BYTESZ] = {0xA1, 0xF9, 0x5A, 0x67, 0xB9, 0x3D,
+ 0x1E, 0x72, 0xE8, 0xC5, 0x71, 0xF1,
+ 0x4C, 0xB2, 0xAA, 0x6D};
- uint8_t ss_dec[SIKEp503_SS_BYTESZ] = {0};
- SIKE_decaps(ss_dec, ct_exp, pk, sk);
- EXPECT_EQ(memcmp(ss_dec, ss_exp, sizeof(ss_exp)), 0);
+ uint8_t ss_dec[SIKE_SS_BYTESZ] = {0};
+ SIKE_decaps(ss_dec, ct, pk, sk);
+ EXPECT_EQ(memcmp(ss_dec, ss_exp, sizeof(ss_exp)), 0);
}
// SIKE_encaps and SIKE_keypair doesn't return zeros.
TEST(SIKE, NonZero) {
- uint8_t sk[SIKEp503_PRV_BYTESZ] = {0};
- uint8_t pk[SIKEp503_PUB_BYTESZ] = {0};
- uint8_t ct[SIKEp503_CT_BYTESZ] = {0};
- uint8_t ss[SIKEp503_SS_BYTESZ] = {0};
+ uint8_t sk[SIKE_PRV_BYTESZ] = {0};
+ uint8_t pk[SIKE_PUB_BYTESZ] = {0};
+ uint8_t ct[SIKE_CT_BYTESZ] = {0};
+ uint8_t ss[SIKE_SS_BYTESZ] = {0};
- // Check secret and public key returned by SIKE_keypair
- EXPECT_EQ(SIKE_keypair(sk, pk), 1);
- uint8_t tmp = 0;
- for (size_t i=0; i<sizeof(sk); i++) tmp|=sk[i];
- EXPECT_NE(tmp, 0);
+ // Check secret and public key returned by SIKE_keypair
+ EXPECT_EQ(SIKE_keypair(sk, pk), 1);
+ uint8_t tmp = 0;
+ for (size_t i = 0; i < sizeof(sk); i++) {
+ tmp |= sk[i];
+ }
+ EXPECT_NE(tmp, 0);
- tmp = 0;
- for (size_t i=0; i<sizeof(pk); i++) tmp|=pk[i];
- EXPECT_NE(tmp, 0);
+ tmp = 0;
+ for (size_t i = 0; i < sizeof(pk); i++) {
+ tmp |= pk[i];
+ }
+ EXPECT_NE(tmp, 0);
- // Check shared secret and ciphertext returned by SIKE_encaps
- SIKE_encaps(ss, ct, pk);
- tmp = 0;
- for (size_t i=0; i<sizeof(ct); i++) tmp|=ct[i];
- EXPECT_NE(tmp, 0);
+ // Check shared secret and ciphertext returned by SIKE_encaps
+ SIKE_encaps(ss, ct, pk);
+ tmp = 0;
+ for (size_t i = 0; i < sizeof(ct); i++) {
+ tmp |= ct[i];
+ }
+ EXPECT_NE(tmp, 0);
- tmp = 0;
- for (size_t i=0; i<sizeof(ss); i++) tmp|=ss[i];
- EXPECT_NE(tmp, 0);
+ tmp = 0;
+ for (size_t i = 0; i < sizeof(ss); i++) {
+ tmp |= ss[i];
+ }
+ EXPECT_NE(tmp, 0);
}
TEST(SIKE, Negative) {
- uint8_t sk[SIKEp503_PRV_BYTESZ] = {0};
- uint8_t pk[SIKEp503_PUB_BYTESZ] = {0};
- uint8_t ct[SIKEp503_CT_BYTESZ] = {0};
- uint8_t ss_enc[SIKEp503_SS_BYTESZ] = {0};
- uint8_t ss_dec[SIKEp503_SS_BYTESZ] = {0};
+ uint8_t sk[SIKE_PRV_BYTESZ] = {0};
+ uint8_t pk[SIKE_PUB_BYTESZ] = {0};
+ uint8_t ct[SIKE_CT_BYTESZ] = {0};
+ uint8_t ss_enc[SIKE_SS_BYTESZ] = {0};
+ uint8_t ss_dec[SIKE_SS_BYTESZ] = {0};
- EXPECT_EQ(SIKE_keypair(sk, pk), 1);
- SIKE_encaps(ss_enc, ct, pk);
+ EXPECT_EQ(SIKE_keypair(sk, pk), 1);
+ SIKE_encaps(ss_enc, ct, pk);
- // Change cipertext
- uint8_t ct_tmp[SIKEp503_CT_BYTESZ] = {0};
- memcpy(ct_tmp, ct, sizeof(ct));
- ct_tmp[0] = ~ct_tmp[0];
- SIKE_decaps(ss_dec, ct_tmp, pk, sk);
- EXPECT_NE(memcmp(ss_enc, ss_dec, SIKEp503_SS_BYTESZ), 0);
+ // Change ciphertext
+ uint8_t ct_tmp[SIKE_CT_BYTESZ] = {0};
+ memcpy(ct_tmp, ct, sizeof(ct));
+ ct_tmp[0] = ~ct_tmp[0];
+ SIKE_decaps(ss_dec, ct_tmp, pk, sk);
+ EXPECT_NE(memcmp(ss_enc, ss_dec, SIKE_SS_BYTESZ), 0);
- // Change secret key
- uint8_t sk_tmp[SIKEp503_PRV_BYTESZ] = {0};
- memcpy(sk_tmp, sk, sizeof(sk));
- sk_tmp[0] = ~sk_tmp[0];
- SIKE_decaps(ss_dec, ct, pk, sk_tmp);
- EXPECT_NE(memcmp(ss_enc, ss_dec, SIKEp503_SS_BYTESZ), 0);
+ // Change secret key
+ uint8_t sk_tmp[SIKE_PRV_BYTESZ] = {0};
+ memcpy(sk_tmp, sk, sizeof(sk));
+ sk_tmp[0] = ~sk_tmp[0];
+ SIKE_decaps(ss_dec, ct, pk, sk_tmp);
+ EXPECT_NE(memcmp(ss_enc, ss_dec, SIKE_SS_BYTESZ), 0);
- // Change public key
- uint8_t pk_tmp[SIKEp503_PUB_BYTESZ] = {0};
- memcpy(pk_tmp, pk, sizeof(pk));
- pk_tmp[0] = ~pk_tmp[0];
- SIKE_decaps(ss_dec, ct, pk_tmp, sk);
- EXPECT_NE(memcmp(ss_enc, ss_dec, SIKEp503_SS_BYTESZ), 0);
+ // Change public key
+ uint8_t pk_tmp[SIKE_PUB_BYTESZ] = {0};
+ memcpy(pk_tmp, pk, sizeof(pk));
+ pk_tmp[0] = ~pk_tmp[0];
+ SIKE_decaps(ss_dec, ct, pk_tmp, sk);
+ EXPECT_NE(memcmp(ss_enc, ss_dec, SIKE_SS_BYTESZ), 0);
}
TEST(SIKE, Unaligned) {
- alignas(4) uint8_t priv[SIKEp503_PRV_BYTESZ + 1];
- alignas(4) uint8_t pub[SIKEp503_PUB_BYTESZ + 1];
- alignas(4) uint8_t shared_key1[SIKEp503_SS_BYTESZ + 1];
- alignas(4) uint8_t ciphertext[SIKEp503_CT_BYTESZ + 1];
- alignas(4) uint8_t shared_key2[SIKEp503_SS_BYTESZ + 1];
+ alignas(4) uint8_t priv[SIKE_PRV_BYTESZ + 1];
+ alignas(4) uint8_t pub[SIKE_PUB_BYTESZ + 1];
+ alignas(4) uint8_t shared_key1[SIKE_SS_BYTESZ + 1];
+ alignas(4) uint8_t ciphertext[SIKE_CT_BYTESZ + 1];
+ alignas(4) uint8_t shared_key2[SIKE_SS_BYTESZ + 1];
ASSERT_TRUE(SIKE_keypair(priv + 1, pub + 1));
SIKE_encaps(shared_key1 + 1, ciphertext + 1, pub + 1);
SIKE_decaps(shared_key2 + 1, ciphertext + 1, pub + 1, priv + 1);
- EXPECT_EQ(memcmp(shared_key1 + 1, shared_key2 + 1, SIKEp503_SS_BYTESZ), 0);
+ EXPECT_EQ(memcmp(shared_key1 + 1, shared_key2 + 1, SIKE_SS_BYTESZ), 0);
}
-#if defined(SUPPORTS_ABI_TEST) && (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64))
+#if defined(SUPPORTS_ABI_TEST) && \
+ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64))
TEST(SIKE, ABI) {
felm_t a, b, c;
dfelm_t d, e, f;
@@ -219,4 +206,46 @@
CHECK_ABI(sike_mpsubx2_asm, d, e, f);
CHECK_ABI(sike_mpdblsubx2_asm, d, e, f);
}
+
+// Additional tests for checking if assembly implementation
+// of MUL and REDC handles carry chains correctly.
+TEST(SIKE, CarryChains) {
+ // Expected results
+ const dfelm_t exp_mul = {
+ 0x0000000000000001, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF,
+ 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+ 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+ };
+
+ const felm_t exp_redc = {
+ 0x93AA0C8C2D3235BE, 0xA8CD35DDDE399B46, 0xB9BBA5469509CA65,
+ 0x6B2FB3A5A2FB86E4, 0x585591BA6DBE862C, 0xD92D3FF5FE0938F2,
+ 0x0001E1F0EE75A1E1
+ };
+
+ // Input
+ dfelm_t in14 = {
+ 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+ 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+ 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+ 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+ 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+ };
+
+ felm_t in7 = {
+ 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+ 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+ 0xFFFFFFFFFFFFFFFF
+ };
+
+ dfelm_t res;
+ sike_mpmul(in7, in7, res);
+ EXPECT_EQ(memcmp(exp_mul, res, sizeof(exp_mul)), 0);
+
+ // modifies in14 and in7
+ sike_fprdc(in14, in7);
+ EXPECT_EQ(memcmp(exp_redc, in7, sizeof(exp_redc)), 0);
+}
#endif // SUPPORTS_ABI_TEST && (X86_64 || AARCH64)
diff --git a/third_party/sike/utils.h b/third_party/sike/utils.h
index 74c640a..cbc8329 100644
--- a/third_party/sike/utils.h
+++ b/third_party/sike/utils.h
@@ -1,7 +1,7 @@
/********************************************************************************************
* SIDH: an efficient supersingular isogeny cryptography library
*
-* Abstract: internal header file for P503
+* Abstract: internal header file for P434
*********************************************************************************************/
#ifndef UTILS_H_
@@ -16,33 +16,33 @@
#define BITS_TO_BYTES(nbits) (((nbits)+7)/8)
// Bit size of the field
-#define BITS_FIELD 503
+#define BITS_FIELD 434
// Byte size of the field
#define FIELD_BYTESZ BITS_TO_BYTES(BITS_FIELD)
-// Number of 64-bit words of a 256-bit element
-#define NBITS_ORDER 256
+// Number of 64-bit words of a 224-bit element
+#define NBITS_ORDER 224
#define NWORDS64_ORDER ((NBITS_ORDER+63)/64)
// Number of elements in Alice's strategy
-#define A_max 125
+#define A_max 108
// Number of elements in Bob's strategy
-#define B_max 159
+#define B_max 137
// Word size size
#define RADIX sizeof(crypto_word_t)*8
// Byte size of a limb
#define LSZ sizeof(crypto_word_t)
#if defined(OPENSSL_64_BIT)
- // Number of words of a 503-bit field element
- #define NWORDS_FIELD 8
- // Number of "0" digits in the least significant part of p503 + 1
- #define p503_ZERO_WORDS 3
+ // Number of words of a 434-bit field element
+ #define NWORDS_FIELD 7
+ // Number of "0" digits in the least significant part of p434 + 1
+ #define ZERO_WORDS 3
// U64_TO_WORDS expands |x| for a |crypto_word_t| array literal.
#define U64_TO_WORDS(x) UINT64_C(x)
#else
- // Number of words of a 503-bit field element
- #define NWORDS_FIELD 16
- // Number of "0" digits in the least significant part of p503 + 1
- #define p503_ZERO_WORDS 7
+ // Number of words of a 434-bit field element
+ #define NWORDS_FIELD 14
+ // Number of "0" digits in the least significant part of p434 + 1
+ #define ZERO_WORDS 6
// U64_TO_WORDS expands |x| for a |crypto_word_t| array literal.
#define U64_TO_WORDS(x) \
(uint32_t)(UINT64_C(x) & 0xffffffff), (uint32_t)(UINT64_C(x) >> 32)
@@ -88,15 +88,15 @@
#define F2ELM_INIT {{ {0}, {0} }}
#define POINT_PROJ_INIT {{ F2ELM_INIT, F2ELM_INIT }}
-// Datatype for representing 503-bit field elements (512-bit max.)
-// Elements over GF(p503) are encoded in 63 octets in little endian format
+// Datatype for representing 434-bit field elements (448-bit max.)
+// Elements over GF(p434) are encoded in 55 octets in little endian format
// (i.e., the least significant octet is located in the lowest memory address).
typedef crypto_word_t felm_t[NWORDS_FIELD];
// An element in F_{p^2}, is composed of two coefficients from F_p, * i.e.
// Fp2 element = c0 + c1*i in F_{p^2}
-// Datatype for representing double-precision 2x503-bit field elements (512-bit max.)
-// Elements (a+b*i) over GF(p503^2), where a and b are defined over GF(p503), are
+// Datatype for representing double-precision 2x434-bit field elements (448-bit max.)
+// Elements (a+b*i) over GF(p434^2), where a and b are defined over GF(p434), are
// encoded as {a, b}, with a in the lowest memory portion.
typedef struct {
felm_t c0;
@@ -106,28 +106,30 @@
// Our F_{p^2} element type is a pointer to the struct.
typedef fp2 f2elm_t[1];
-// Datatype for representing double-precision 2x503-bit
+// Datatype for representing double-precision 2x434-bit
// field elements in contiguous memory.
typedef crypto_word_t dfelm_t[2*NWORDS_FIELD];
-// Constants used during SIKEp503 computation.
+// Constants used during SIKE computation.
struct params_t {
- // Stores P503 prime
+ // Stores a prime
const crypto_word_t prime[NWORDS_FIELD];
- // Stores P503 + 1
+ // Stores prime + 1
const crypto_word_t prime_p1[NWORDS_FIELD];
- // Stores P503 * 2
+ // Stores prime * 2
const crypto_word_t prime_x2[NWORDS_FIELD];
- // Alice's generator values {XPA0 + XPA1*i, XQA0, XRA0 + XRA1*i}
- // in GF(p503^2), expressed in Montgomery representation
- const crypto_word_t A_gen[5*NWORDS_FIELD];
- // Bob's generator values {XPB0 + XPB1*i, XQB0, XRB0 + XRB1*i}
- // in GF(p503^2), expressed in Montgomery representation
- const crypto_word_t B_gen[5*NWORDS_FIELD];
- // Montgomery constant mont_R2 = (2^512)^2 mod p503
+ // Alice's generator values {XPA0 + XPA1*i, XQA0 + XQA1*i, XRA0 + XRA1*i}
+ // in GF(prime^2), expressed in Montgomery representation
+ const crypto_word_t A_gen[6*NWORDS_FIELD];
+ // Bob's generator values {XPB0 + XPB1*i, XQB0 + XQB1*i, XRB0 + XRB1*i}
+ // in GF(prime^2), expressed in Montgomery representation
+ const crypto_word_t B_gen[6*NWORDS_FIELD];
+ // Montgomery constant mont_R2 = (2^448)^2 mod prime
const crypto_word_t mont_R2[NWORDS_FIELD];
// Value 'one' in Montgomery representation
const crypto_word_t mont_one[NWORDS_FIELD];
+ // Value '6' in Montgomery representation
+ const crypto_word_t mont_six[NWORDS_FIELD];
// Fixed parameters for isogeny tree computation
const unsigned int A_strat[A_max-1];
const unsigned int B_strat[B_max-1];
diff --git a/tool/speed.cc b/tool/speed.cc
index 47edc75..3929cf6 100644
--- a/tool/speed.cc
+++ b/tool/speed.cc
@@ -296,14 +296,14 @@
return true;
}
-static bool SpeedSIKEP503(const std::string &selected) {
+static bool SpeedSIKEP434(const std::string &selected) {
if (!selected.empty() && selected.find("SIKE") == std::string::npos) {
return true;
}
// speed generation
- uint8_t public_SIKE[SIKEp503_PUB_BYTESZ];
- uint8_t private_SIKE[SIKEp503_PRV_BYTESZ];
- uint8_t ct[SIKEp503_CT_BYTESZ];
+ uint8_t public_SIKE[SIKE_PUB_BYTESZ];
+ uint8_t private_SIKE[SIKE_PRV_BYTESZ];
+ uint8_t ct[SIKE_CT_BYTESZ];
bool res;
{
@@ -312,7 +312,7 @@
[&private_SIKE, &public_SIKE]() -> bool {
return (SIKE_keypair(private_SIKE, public_SIKE) == 1);
});
- results.Print("SIKE/P503 generate");
+ results.Print("SIKE/P434 generate");
}
if (!res) {
@@ -324,11 +324,11 @@
TimeResults results;
TimeFunction(&results,
[&ct, &public_SIKE]() -> bool {
- uint8_t ss[SIKEp503_SS_BYTESZ];
+ uint8_t ss[SIKE_SS_BYTESZ];
SIKE_encaps(ss, ct, public_SIKE);
return true;
});
- results.Print("SIKE/P503 encap");
+ results.Print("SIKE/P434 encap");
}
if (!res) {
@@ -340,11 +340,11 @@
TimeResults results;
TimeFunction(&results,
[&ct, &public_SIKE, &private_SIKE]() -> bool {
- uint8_t ss[SIKEp503_SS_BYTESZ];
+ uint8_t ss[SIKE_SS_BYTESZ];
SIKE_decaps(ss, ct, public_SIKE, private_SIKE);
return true;
});
- results.Print("SIKE/P503 decap");
+ results.Print("SIKE/P434 decap");
}
if (!res) {
@@ -998,7 +998,7 @@
!SpeedECDH(selected) ||
!SpeedECDSA(selected) ||
!Speed25519(selected) ||
- !SpeedSIKEP503(selected) ||
+ !SpeedSIKEP434(selected) ||
!SpeedSPAKE2(selected) ||
!SpeedScrypt(selected) ||
!SpeedRSAKeyGen(selected) ||