Implements SIKE/p434

* CECPQ2b will use SIKE/p434 instead of SIKE/p503
* KEM uses SHA256 instead of HMAC-SHA256
* implements new starting curve: y^2=x^3 + 6x^2 + x
* adds optimized implementation for aarch64
* adds optimized implementation for AMD64 CPUs
  which do not support MULX/ADOX/ADCX
* syncs the SIKE test code with the NIST Round 2
  specification.
* removes references to field size from variable
  names, tests and defines.

Change-Id: I5359c6c62ad342354c6d337f7ee525158586ec93
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/36704
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index 955dd8b..e97a4e1 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -411,7 +411,7 @@
   ../third_party/fiat/curve25519.c
   ../third_party/sike/fpx.c
   ../third_party/sike/isogeny.c
-  ../third_party/sike/P503.c
+  ../third_party/sike/params.c
   ../third_party/sike/sike.c
   ../third_party/sike/asm/fp_generic.c
 
diff --git a/ssl/ssl_key_share.cc b/ssl/ssl_key_share.cc
index 2bf177b..826fb1a 100644
--- a/ssl/ssl_key_share.cc
+++ b/ssl/ssl_key_share.cc
@@ -319,18 +319,18 @@
               uint8_t *out_alert, Span<const uint8_t> peer_key) override {
     uint8_t public_x25519[32];
     uint8_t private_x25519[32];
-    uint8_t sike_ciphertext[SIKEp503_CT_BYTESZ] = {0};
+    uint8_t sike_ciphertext[SIKE_CT_BYTESZ] = {0};
 
     *out_alert = SSL_AD_INTERNAL_ERROR;
 
-    if (peer_key.size() != sizeof(public_x25519) + SIKEp503_PUB_BYTESZ) {
+    if (peer_key.size() != sizeof(public_x25519) + SIKE_PUB_BYTESZ) {
       *out_alert = SSL_AD_DECODE_ERROR;
       OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT);
       return false;
     }
 
     Array<uint8_t> secret;
-    if (!secret.Init(sizeof(private_x25519_) + SIKEp503_SS_BYTESZ)) {
+    if (!secret.Init(sizeof(private_x25519_) + SIKE_SS_BYTESZ)) {
       OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
       return false;
     }
@@ -357,12 +357,12 @@
     *out_alert = SSL_AD_INTERNAL_ERROR;
 
     Array<uint8_t> secret;
-    if (!secret.Init(sizeof(private_x25519_) + SIKEp503_SS_BYTESZ)) {
+    if (!secret.Init(sizeof(private_x25519_) + SIKE_SS_BYTESZ)) {
       OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
       return false;
     }
 
-    if (peer_key.size() != 32 + SIKEp503_CT_BYTESZ ||
+    if (peer_key.size() != 32 + SIKE_CT_BYTESZ ||
         !X25519(secret.data(), private_x25519_, peer_key.data())) {
       *out_alert = SSL_AD_DECODE_ERROR;
       OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT);
@@ -377,8 +377,8 @@
 
  private:
   uint8_t private_x25519_[32];
-  uint8_t private_sike_[SIKEp503_PRV_BYTESZ];
-  uint8_t public_sike_[SIKEp503_PUB_BYTESZ];
+  uint8_t private_sike_[SIKE_PRV_BYTESZ];
+  uint8_t public_sike_[SIKE_PUB_BYTESZ];
 };
 
 CONSTEXPR_ARRAY NamedGroup kNamedGroups[] = {
diff --git a/ssl/test/runner/key_agreement.go b/ssl/test/runner/key_agreement.go
index 0d6405f..f4789b6 100644
--- a/ssl/test/runner/key_agreement.go
+++ b/ssl/test/runner/key_agreement.go
@@ -434,7 +434,7 @@
 	return preMasterSecret, nil
 }
 
-// cecpq2BCurve implements CECPQ2b, which is SIKEp503 combined with X25519.
+// cecpq2BCurve implements CECPQ2b, which is SIKE combined with X25519.
 type cecpq2BCurve struct {
 	// Both public key and shared secret size
 	x25519PrivateKey [32]byte
diff --git a/ssl/test/runner/sike/arith.go b/ssl/test/runner/sike/arith.go
index 338a714..10a2ca6 100644
--- a/ssl/test/runner/sike/arith.go
+++ b/ssl/test/runner/sike/arith.go
@@ -22,22 +22,22 @@
 func fpAddRdc(z, x, y *Fp) {
 	var carry uint64
 
-	// z=x+y % p503
+	// z=x+y % p
 	for i := 0; i < FP_WORDS; i++ {
 		z[i], carry = bits.Add64(x[i], y[i], carry)
 	}
 
-	// z = z - p503x2
+	// z = z - pX2
 	carry = 0
 	for i := 0; i < FP_WORDS; i++ {
-		z[i], carry = bits.Sub64(z[i], p503x2[i], carry)
+		z[i], carry = bits.Sub64(z[i], pX2[i], carry)
 	}
 
-	// if z<0 add p503x2 back
+	// if z<0 add pX2 back
 	mask := uint64(0 - carry)
 	carry = 0
 	for i := 0; i < FP_WORDS; i++ {
-		z[i], carry = bits.Add64(z[i], p503x2[i]&mask, carry)
+		z[i], carry = bits.Add64(z[i], pX2[i]&mask, carry)
 	}
 }
 
@@ -45,16 +45,16 @@
 func fpSubRdc(z, x, y *Fp) {
 	var borrow uint64
 
-	// z = z - p503x2
+	// z = x - y
 	for i := 0; i < FP_WORDS; i++ {
 		z[i], borrow = bits.Sub64(x[i], y[i], borrow)
 	}
 
-	// if z<0 add p503x2 back
+	// if z<0 add pX2 back
 	mask := uint64(0 - borrow)
 	borrow = 0
 	for i := 0; i < FP_WORDS; i++ {
-		z[i], borrow = bits.Add64(z[i], p503x2[i]&mask, borrow)
+		z[i], borrow = bits.Add64(z[i], pX2[i]&mask, borrow)
 	}
 }
 
@@ -62,14 +62,14 @@
 func fpRdcP(x *Fp) {
 	var borrow, mask uint64
 	for i := 0; i < FP_WORDS; i++ {
-		x[i], borrow = bits.Sub64(x[i], p503[i], borrow)
+		x[i], borrow = bits.Sub64(x[i], p[i], borrow)
 	}
 
 	// Sets all bits if borrow = 1
 	mask = 0 - borrow
 	borrow = 0
 	for i := 0; i < FP_WORDS; i++ {
-		x[i], borrow = bits.Add64(x[i], p503[i]&mask, borrow)
+		x[i], borrow = bits.Add64(x[i], p[i]&mask, borrow)
 	}
 }
 
@@ -123,12 +123,12 @@
 	var hi, lo uint64
 	var count int
 
-	count = 3 // number of 0 digits in the least significat part of p503 + 1
+	count = 3 // number of 0 digits in the least significant part of p + 1
 
 	for i := 0; i < FP_WORDS; i++ {
 		for j := 0; j < i; j++ {
 			if j < (i - count + 1) {
-				hi, lo = bits.Mul64(z[j], p503p1[i-j])
+				hi, lo = bits.Mul64(z[j], p1[i-j])
 				v, carry = bits.Add64(lo, v, 0)
 				u, carry = bits.Add64(hi, u, carry)
 				t += carry
@@ -150,7 +150,7 @@
 		}
 		for j := i - FP_WORDS + 1; j < FP_WORDS; j++ {
 			if j < (FP_WORDS - count) {
-				hi, lo = bits.Mul64(z[j], p503p1[i-j])
+				hi, lo = bits.Mul64(z[j], p1[i-j])
 				v, carry = bits.Add64(lo, v, 0)
 				u, carry = bits.Add64(hi, u, carry)
 				t += carry
@@ -188,7 +188,7 @@
 	mask = 0 - borrow
 	borrow = 0
 	for i := FP_WORDS; i < 2*FP_WORDS; i++ {
-		z[i], borrow = bits.Add64(z[i], p503[i-FP_WORDS]&mask, borrow)
+		z[i], borrow = bits.Add64(z[i], p[i-FP_WORDS]&mask, borrow)
 	}
 }
 
@@ -210,25 +210,34 @@
 //
 // Allowed to overlap x with dest.
 // All values in Montgomery domains
+// Set dest = x^(2^k), for k >= 1, by repeated squarings.
 func p34(dest, x *Fp) {
+	var lookup [16]Fp
 
-	// Set dest = x^(2^k), for k >= 1, by repeated squarings.
-	pow2k := func(dest, x *Fp, k uint8) {
-		fpMulRdc(dest, x, x)
-		for i := uint8(1); i < k; i++ {
-			fpMulRdc(dest, dest, dest)
-		}
-	}
-	// Sliding-window strategy computed with etc/scripts/sliding_window_strat_calc.py
-	//
 	// This performs sum(powStrategy) + 1 squarings and len(lookup) + len(mulStrategy)
 	// multiplications.
-	powStrategy := []uint8{1, 12, 5, 5, 2, 7, 11, 3, 8, 4, 11, 4, 7, 5, 6, 3, 7, 5, 7, 2, 12, 5, 6, 4, 6, 8, 6, 4, 7, 5, 5, 8, 5, 8, 5, 5, 8, 9, 3, 6, 2, 10, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3}
-	mulStrategy := []uint8{0, 12, 11, 10, 0, 1, 8, 3, 7, 1, 8, 3, 6, 7, 14, 2, 14, 14, 9, 0, 13, 9, 15, 5, 12, 7, 13, 7, 15, 6, 7, 9, 0, 5, 7, 6, 8, 8, 3, 7, 0, 10, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 3}
+	powStrategy := []uint8{
+		0x03, 0x0A, 0x07, 0x05, 0x06, 0x05, 0x03, 0x08, 0x04, 0x07,
+		0x05, 0x06, 0x04, 0x05, 0x09, 0x06, 0x03, 0x0B, 0x05, 0x05,
+		0x02, 0x08, 0x04, 0x07, 0x07, 0x08, 0x05, 0x06, 0x04, 0x08,
+		0x05, 0x02, 0x0A, 0x06, 0x05, 0x04, 0x08, 0x05, 0x05, 0x05,
+		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x01}
+	mulStrategy := []uint8{
+		0x02, 0x0F, 0x09, 0x08, 0x0E, 0x0C, 0x02, 0x08, 0x05, 0x0F,
+		0x08, 0x0F, 0x06, 0x06, 0x03, 0x02, 0x00, 0x0A, 0x09, 0x0D,
+		0x01, 0x0C, 0x03, 0x07, 0x01, 0x0A, 0x08, 0x0B, 0x02, 0x0F,
+		0x0E, 0x01, 0x0B, 0x0C, 0x0E, 0x03, 0x0B, 0x0F, 0x0F, 0x0F,
+		0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
+		0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
+		0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
+		0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x00}
+	initialMul := uint8(8)
 
 	// Precompute lookup table of odd multiples of x for window
 	// size k=5.
-	lookup := [16]Fp{}
 	var xx Fp
 	fpMulRdc(&xx, x, x)
 	lookup[0] = *x
@@ -239,9 +248,12 @@
 	// Now lookup = {x, x^3, x^5, ... }
 	// so that lookup[i] = x^{2*i + 1}
 	// so that lookup[k/2] = x^k, for odd k
-	*dest = lookup[mulStrategy[0]]
-	for i := uint8(1); i < uint8(len(powStrategy)); i++ {
-		pow2k(dest, dest, powStrategy[i])
+	*dest = lookup[initialMul]
+	for i := uint8(0); i < uint8(len(powStrategy)); i++ {
+		fpMulRdc(dest, dest, dest)
+		for j := uint8(1); j < powStrategy[i]; j++ {
+			fpMulRdc(dest, dest, dest)
+		}
 		fpMulRdc(dest, dest, &lookup[mulStrategy[i]])
 	}
 }
diff --git a/ssl/test/runner/sike/consts.go b/ssl/test/runner/sike/consts.go
index 0ecff52..9d68a4f 100644
--- a/ssl/test/runner/sike/consts.go
+++ b/ssl/test/runner/sike/consts.go
@@ -52,17 +52,21 @@
 	PublicKeySize int
 	// The shared secret size, in bytes.
 	SharedSecretSize int
+	// Defines A,C constant for starting curve Cy^2 = x^3 + Ax^2 + x
+	InitCurve ProjectiveCurveParameters
 	// 2- and 3-torsion group parameter definitions
 	A, B DomainParams
-	// Precomputed identity element in the Fp2 in Montgomery domain
-	OneFp2 Fp2
 	// Precomputed 1/2 in the Fp2 in Montgomery domain
 	HalfFp2 Fp2
+	// Precomputed identity element in the Fp2 in Montgomery domain
+	OneFp2 Fp2
 	// Length of SIKE secret message. Must be one of {24,32,40},
 	// depending on size of prime field used (see [SIKE], 1.4 and 5.1)
 	MsgLen int
 	// Length of SIKE ephemeral KEM key (see [SIKE], 1.4 and 5.1)
 	KemSize int
+	// Size of a ciphertext returned by encapsulation in bytes
+	CiphertextSize int
 }
 
 // Stores curve projective parameters equivalent to A/C. Meaning of the
@@ -130,172 +134,184 @@
 	// 110 - SIKE
 	KeyVariant_SIKE = 1<<2 | KeyVariant_SIDH_B
 	// Number of uint64 limbs used to store field element
-	FP_WORDS = 8
+	FP_WORDS = 7
 )
 
 // Used internally by this package
 // -------------------------------
 
-var p503 = Fp{
-	0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xABFFFFFFFFFFFFFF,
-	0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E,
-}
+var (
+	p = Fp{
+		0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFDC1767AE2FFFFFF,
+		0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x2341F27177344,
+	}
 
-// 2*503
-var p503x2 = Fp{
-	0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x57FFFFFFFFFFFFFF,
-	0x2610B7B44423CF41, 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0, 0x0080CDEA83023C3C,
-}
+	// 2*p434
+	pX2 = Fp{
+		0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFB82ECF5C5FFFFFF,
+		0xF78CB8F062B15D47, 0xD9F8BFAD038A40AC, 0x4683E4E2EE688,
+	}
 
-// p503 + 1
-var p503p1 = Fp{
-	0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xAC00000000000000,
-	0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E,
-}
+	// p434 + 1
+	p1 = Fp{
+		0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFDC1767AE3000000,
+		0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344,
+	}
 
-// R^2=(2^512)^2 mod p
-var p503R2 = Fp{
-	0x5289A0CF641D011F, 0x9B88257189FED2B9, 0xA3B365D58DC8F17A, 0x5BC57AB6EFF168EC,
-	0x9E51998BD84D4423, 0xBF8999CBAC3B5695, 0x46E9127BCE14CDB6, 0x003F6CFCE8B81771,
-}
+	// R^2=(2^448)^2 mod p
+	R2 = Fp{
+		0x28E55B65DCD69B30, 0xACEC7367768798C2, 0xAB27973F8311688D, 0x175CC6AF8D6C7C0B,
+		0xABCD92BF2DDE347E, 0x69E16A61C7686D9A, 0x000025A89BCDD12A,
+	}
 
-// p503 + 1 left-shifted by 8, assuming little endianness
-var p503p1s8 = Fp{
-	0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
-	0x085BDA2211E7A0AC, 0x9BF6C87B7E7DAF13, 0x45C6BDDA77A4D01B, 0x4066F541811E1E60,
-}
+	// 1/2 * R mod p
+	half = Fp2{
+		A: Fp{
+			0x0000000000003A16, 0x0000000000000000, 0x0000000000000000, 0x5C87FA027E000000,
+			0x6C00D27DAACFD66A, 0x74992A2A2FBBA086, 0x0000767753DE976D},
+	}
 
-// 1*R mod p
-var P503_OneFp2 = Fp2{
-	A: Fp{
-		0x00000000000003F9, 0x0000000000000000, 0x0000000000000000, 0xB400000000000000,
-		0x63CB1A6EA6DED2B4, 0x51689D8D667EB37D, 0x8ACD77C71AB24142, 0x0026FBAEC60F5953},
-}
+	// 1*R mod p
+	one = Fp2{
+		A: Fp{
+			0x000000000000742C, 0x0000000000000000, 0x0000000000000000, 0xB90FF404FC000000,
+			0xD801A4FB559FACD4, 0xE93254545F77410C, 0x0000ECEEA7BD2EDA},
+	}
 
-// 1/2 * R mod p
-var P503_HalfFp2 = Fp2{
-	A: Fp{
-		0x00000000000001FC, 0x0000000000000000, 0x0000000000000000, 0xB000000000000000,
-		0x3B69BB2464785D2A, 0x36824A2AF0FE9896, 0xF5899F427A94F309, 0x0033B15203C83BB8},
-}
+	// 6*R mod p
+	six = Fp2{
+		A: Fp{
+			0x000000000002B90A, 0x0000000000000000, 0x0000000000000000, 0x5ADCCB2822000000,
+			0x187D24F39F0CAFB4, 0x9D353A4D394145A0, 0x00012559A0403298},
+	}
 
-var Params SidhParams
+	Params SidhParams
+)
 
 func init() {
 	Params = SidhParams{
 		// SIDH public key byte size.
-		PublicKeySize: 378,
+		PublicKeySize: 330,
 		// SIDH shared secret byte size.
-		SharedSecretSize: 126,
+		SharedSecretSize: 110,
+		InitCurve: ProjectiveCurveParameters{
+			A: six,
+			C: one,
+		},
 		A: DomainParams{
 			// The x-coordinate of PA
 			Affine_P: Fp2{
 				A: Fp{
-					0xE7EF4AA786D855AF, 0xED5758F03EB34D3B, 0x09AE172535A86AA9, 0x237B9CC07D622723,
-					0xE3A284CBA4E7932D, 0x27481D9176C5E63F, 0x6A323FF55C6E71BF, 0x002ECC31A6FB8773,
+					0x05ADF455C5C345BF, 0x91935C5CC767AC2B, 0xAFE4E879951F0257, 0x70E792DC89FA27B1,
+					0xF797F526BB48C8CD, 0x2181DB6131AF621F, 0x00000A1C08B1ECC4,
 				},
 				B: Fp{
-					0x64D02E4E90A620B8, 0xDAB8128537D4B9F1, 0x4BADF77B8A228F98, 0x0F5DBDF9D1FB7D1B,
-					0xBEC4DB288E1A0DCC, 0xE76A8665E80675DB, 0x6D6F252E12929463, 0x003188BD1463FACC,
+					0x74840EB87CDA7788, 0x2971AA0ECF9F9D0B, 0xCB5732BDF41715D5, 0x8CD8E51F7AACFFAA,
+					0xA7F424730D7E419F, 0xD671EB919A179E8C, 0x0000FFA26C5A924A,
 				},
 			},
 			// The x-coordinate of QA
 			Affine_Q: Fp2{
 				A: Fp{
-					0xB79D41025DE85D56, 0x0B867DA9DF169686, 0x740E5368021C827D, 0x20615D72157BF25C,
-					0xFF1590013C9B9F5B, 0xC884DCADE8C16CEA, 0xEBD05E53BF724E01, 0x0032FEF8FDA5748C,
+					0xFEC6E64588B7273B, 0xD2A626D74CBBF1C6, 0xF8F58F07A78098C7, 0xE23941F470841B03,
+					0x1B63EDA2045538DD, 0x735CFEB0FFD49215, 0x0001C4CB77542876,
 				},
 				B: Fp{
-					0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
-					0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+					0xADB0F733C17FFDD6, 0x6AFFBD037DA0A050, 0x680EC43DB144E02F, 0x1E2E5D5FF524E374,
+					0xE2DDA115260E2995, 0xA6E4B552E2EDE508, 0x00018ECCDDF4B53E,
 				},
 			},
 			// The x-coordinate of RA = PA-QA
 			Affine_R: Fp2{
 				A: Fp{
-					0x12E2E849AA0A8006, 0x41CF47008635A1E8, 0x9CD720A70798AED7, 0x42A820B42FCF04CF,
-					0x7BF9BAD32AAE88B1, 0xF619127A54090BBE, 0x1CB10D8F56408EAA, 0x001D6B54C3C0EDEB,
+					0x01BA4DB518CD6C7D, 0x2CB0251FE3CC0611, 0x259B0C6949A9121B, 0x60E17AC16D2F82AD,
+					0x3AA41F1CE175D92D, 0x413FBE6A9B9BC4F3, 0x00022A81D8D55643,
 				},
 				B: Fp{
-					0x34DB54931CBAAC36, 0x420A18CB8DD5F0C4, 0x32008C1A48C0F44D, 0x3B3BA772B1CFD44D,
-					0xA74B058FDAF13515, 0x095FC9CA7EEC17B4, 0x448E829D28F120F8, 0x00261EC3ED16A489,
+					0xB8ADBC70FC82E54A, 0xEF9CDDB0D5FADDED, 0x5820C734C80096A0, 0x7799994BAA96E0E4,
+					0x044961599E379AF8, 0xDB2B94FBF09F27E2, 0x0000B87FC716C0C6,
 				},
 			},
 			// Max size of secret key for 2-torsion group, corresponds to 2^e2 - 1
-			SecretBitLen: 250,
+			SecretBitLen: 216,
 			// SecretBitLen in bytes.
-			SecretByteLen: uint((250 + 7) / 8),
+			SecretByteLen: 27,
 			// 2-torsion group computation strategy
 			IsogenyStrategy: []uint32{
-				0x3D, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01,
-				0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02,
-				0x01, 0x01, 0x02, 0x01, 0x01, 0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
-				0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
-				0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x1D, 0x10, 0x08, 0x04, 0x02, 0x01,
-				0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02,
-				0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x0D, 0x08,
-				0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
-				0x05, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01},
+				0x30, 0x1C, 0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+				0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04,
+				0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01,
+				0x02, 0x01, 0x01, 0x0D, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02,
+				0x01, 0x01, 0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x04,
+				0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01,
+				0x15, 0x0C, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
+				0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x03, 0x02, 0x01,
+				0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x09, 0x05, 0x03,
+				0x02, 0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x04,
+				0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01},
 		},
 		B: DomainParams{
 			// The x-coordinate of PB
 			Affine_P: Fp2{
 				A: Fp{
-					0x7EDE37F4FA0BC727, 0xF7F8EC5C8598941C, 0xD15519B516B5F5C8, 0xF6D5AC9B87A36282,
-					0x7B19F105B30E952E, 0x13BD8B2025B4EBEE, 0x7B96D27F4EC579A2, 0x00140850CAB7E5DE,
+					0x6E5497556EDD48A3, 0x2A61B501546F1C05, 0xEB919446D049887D, 0x5864A4A69D450C4F,
+					0xB883F276A6490D2B, 0x22CC287022D5F5B9, 0x0001BED4772E551F,
 				},
 				B: Fp{
-					0x7764909DAE7B7B2D, 0x578ABB16284911AB, 0x76E2BFD146A6BF4D, 0x4824044B23AA02F0,
-					0x1105048912A321F3, 0xB8A2E482CF0F10C1, 0x42FF7D0BE2152085, 0x0018E599C5223352,
+					0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+					0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
 				},
 			},
 			// The x-coordinate of QB
 			Affine_Q: Fp2{
 				A: Fp{
-					0x4256C520FB388820, 0x744FD7C3BAAF0A13, 0x4B6A2DDDB12CBCB8, 0xE46826E27F427DF8,
-					0xFE4A663CD505A61B, 0xD6B3A1BAF025C695, 0x7C3BB62B8FCC00BD, 0x003AFDDE4A35746C,
+					0xFAE2A3F93D8B6B8E, 0x494871F51700FE1C, 0xEF1A94228413C27C, 0x498FF4A4AF60BD62,
+					0xB00AD2A708267E8A, 0xF4328294E017837F, 0x000034080181D8AE,
 				},
 				B: Fp{
 					0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
-					0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+					0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
 				},
 			},
 			// The x-coordinate of RB = PB - QB
 			Affine_R: Fp2{
 				A: Fp{
-					0x75601CD1E6C0DFCB, 0x1A9007239B58F93E, 0xC1F1BE80C62107AC, 0x7F513B898F29FF08,
-					0xEA0BEDFF43E1F7B2, 0x2C6D94018CBAE6D0, 0x3A430D31BCD84672, 0x000D26892ECCFE83,
+					0x283B34FAFEFDC8E4, 0x9208F44977C3E647, 0x7DEAE962816F4E9A, 0x68A2BA8AA262EC9D,
+					0x8176F112EA43F45B, 0x02106D022634F504, 0x00007E8A50F02E37,
 				},
 				B: Fp{
-					0x1119D62AEA3007A1, 0xE3702AA4E04BAE1B, 0x9AB96F7D59F990E7, 0xF58440E8B43319C0,
-					0xAF8134BEE1489775, 0xE7F7774E905192AA, 0xF54AE09308E98039, 0x001EF7A041A86112,
+					0xB378B7C1DA22CCB1, 0x6D089C99AD1D9230, 0xEBE15711813E2369, 0x2B35A68239D48A53,
+					0x445F6FD138407C93, 0xBEF93B29A3F6B54B, 0x000173FA910377D3,
 				},
 			},
 			// Size of secret key for 3-torsion group, corresponds to log_2(3^e3) - 1.
-			SecretBitLen: 252,
+			SecretBitLen: 217,
 			// SecretBitLen in bytes.
-			SecretByteLen: uint((252 + 7) / 8),
+			SecretByteLen: 28,
 			// 3-torsion group computation strategy
 			IsogenyStrategy: []uint32{
-				0x47, 0x26, 0x15, 0x0D, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02,
-				0x01, 0x01, 0x02, 0x01, 0x01, 0x05, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02,
-				0x01, 0x01, 0x01, 0x09, 0x05, 0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01,
-				0x01, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x11, 0x09, 0x05, 0x03, 0x02,
-				0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02,
-				0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01,
-				0x01, 0x02, 0x01, 0x01, 0x21, 0x11, 0x09, 0x05, 0x03, 0x02, 0x01, 0x01, 0x01, 0x01,
-				0x02, 0x01, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04,
-				0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
-				0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01,
-				0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01,
-				0x01, 0x02, 0x01, 0x01},
+				0x42, 0x21, 0x11, 0x09, 0x05, 0x03, 0x02, 0x01, 0x01, 0x01,
+				0x01, 0x02, 0x01, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x01,
+				0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02,
+				0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x10,
+				0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04,
+				0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01,
+				0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+				0x01, 0x20, 0x10, 0x08, 0x04, 0x03, 0x01, 0x01, 0x01, 0x01,
+				0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
+				0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02,
+				0x01, 0x01, 0x02, 0x01, 0x01, 0x10, 0x08, 0x04, 0x02, 0x01,
+				0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+				0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04,
+				0x02, 0x01, 0x01, 0x02, 0x01, 0x01},
 		},
-		OneFp2:  P503_OneFp2,
-		HalfFp2: P503_HalfFp2,
-		MsgLen:  24,
-		// SIKEp503 provides 128 bit of classical security ([SIKE], 5.1)
+		OneFp2:  one,
+		HalfFp2: half,
+		MsgLen:  16,
+		// SIKEp434 provides 128 bit of classical security ([SIKE], 5.1)
 		KemSize: 16,
-		// ceil(503+7/8)
-		Bytelen: 63,
+		// ceil(434/8)
+		Bytelen:        55,
+		CiphertextSize: 16 + 330,
 	}
 }
diff --git a/ssl/test/runner/sike/curve.go b/ssl/test/runner/sike/curve.go
index 69febaf..8172546 100644
--- a/ssl/test/runner/sike/curve.go
+++ b/ssl/test/runner/sike/curve.go
@@ -72,9 +72,9 @@
 	var aRR FpX2
 
 	// convert to montgomery domain
-	fpMul(&aRR, &x.A, &p503R2) // = a*R*R
-	fpMontRdc(&x.A, &aRR)      // = a*R mod p
-	fpMul(&aRR, &x.B, &p503R2)
+	fpMul(&aRR, &x.A, &R2) // = a*R*R
+	fpMontRdc(&x.A, &aRR)  // = a*R mod p
+	fpMul(&aRR, &x.B, &R2)
 	fpMontRdc(&x.B, &aRR)
 }
 
diff --git a/ssl/test/runner/sike/sike.go b/ssl/test/runner/sike/sike.go
index fc6de05..dcd6cfc 100644
--- a/ssl/test/runner/sike/sike.go
+++ b/ssl/test/runner/sike/sike.go
@@ -15,27 +15,12 @@
 package sike
 
 import (
-	"crypto/hmac"
 	"crypto/sha256"
 	"crypto/subtle"
 	"errors"
 	"io"
 )
 
-// Constants used for cSHAKE customization
-// Those values are different than in [SIKE] - they are encoded on 16bits. This is
-// done in order for implementation to be compatible with [REF] and test vectors.
-var G = []byte{0x00, 0x00}
-var H = []byte{0x01, 0x00}
-var F = []byte{0x02, 0x00}
-
-// Generates HMAC-SHA256 sum
-func hashMac(out, in, S []byte) {
-	h := hmac.New(sha256.New, in)
-	h.Write(S)
-	copy(out, h.Sum(nil))
-}
-
 // Zeroize Fp2
 func zeroize(fp *Fp2) {
 	// Zeroizing in 2 separated loops tells compiler to
@@ -236,9 +221,8 @@
 // Generate a public key in the 2-torsion group
 func publicKeyGenA(prv *PrivateKey) (pub *PublicKey) {
 	var xPA, xQA, xRA ProjectivePoint
-	var xPB, xQB, xRB, xR ProjectivePoint
+	var xPB, xQB, xRB, xK ProjectivePoint
 	var invZP, invZQ, invZR Fp2
-	var tmp ProjectiveCurveParameters
 
 	pub = NewPublicKey(KeyVariant_SIDH_A)
 	var phi = NewIsogeny4()
@@ -254,16 +238,11 @@
 	xPB = ProjectivePoint{X: prv.params.B.Affine_P, Z: prv.params.OneFp2}
 
 	// Find isogeny kernel
-	tmp.C = pub.params.OneFp2
-	xR = ScalarMul3Pt(&tmp, &xPA, &xQA, &xRA, prv.params.A.SecretBitLen, prv.Scalar)
-
-	// Reset params object and travers isogeny tree
-	tmp.C = pub.params.OneFp2
-	zeroize(&tmp.A)
-	traverseTreePublicKeyA(&tmp, &xR, &xPB, &xQB, &xRB, pub)
+	xK = ScalarMul3Pt(&pub.params.InitCurve, &xPA, &xQA, &xRA, prv.params.A.SecretBitLen, prv.Scalar)
+	traverseTreePublicKeyA(&pub.params.InitCurve, &xK, &xPB, &xQB, &xRB, pub)
 
 	// Secret isogeny
-	phi.GenerateCurve(&xR)
+	phi.GenerateCurve(&xK)
 	xPA = phi.EvaluatePoint(&xPB)
 	xQA = phi.EvaluatePoint(&xQB)
 	xRA = phi.EvaluatePoint(&xRB)
@@ -277,10 +256,9 @@
 
 // Generate a public key in the 3-torsion group
 func publicKeyGenB(prv *PrivateKey) (pub *PublicKey) {
-	var xPB, xQB, xRB, xR ProjectivePoint
+	var xPB, xQB, xRB, xK ProjectivePoint
 	var xPA, xQA, xRA ProjectivePoint
 	var invZP, invZQ, invZR Fp2
-	var tmp ProjectiveCurveParameters
 
 	pub = NewPublicKey(prv.keyVariant)
 	var phi = NewIsogeny3()
@@ -295,14 +273,10 @@
 	xQA = ProjectivePoint{X: prv.params.A.Affine_Q, Z: prv.params.OneFp2}
 	xRA = ProjectivePoint{X: prv.params.A.Affine_R, Z: prv.params.OneFp2}
 
-	tmp.C = pub.params.OneFp2
-	xR = ScalarMul3Pt(&tmp, &xPB, &xQB, &xRB, prv.params.B.SecretBitLen, prv.Scalar)
+	xK = ScalarMul3Pt(&pub.params.InitCurve, &xPB, &xQB, &xRB, prv.params.B.SecretBitLen, prv.Scalar)
+	traverseTreePublicKeyB(&pub.params.InitCurve, &xK, &xPA, &xQA, &xRA, pub)
 
-	tmp.C = pub.params.OneFp2
-	zeroize(&tmp.A)
-	traverseTreePublicKeyB(&tmp, &xR, &xPA, &xQA, &xRA, pub)
-
-	phi.GenerateCurve(&xR)
+	phi.GenerateCurve(&xK)
 	xPB = phi.EvaluatePoint(&xPA)
 	xQB = phi.EvaluatePoint(&xQA)
 	xRB = phi.EvaluatePoint(&xRA)
@@ -321,27 +295,28 @@
 // Establishing shared keys in in 2-torsion group
 func deriveSecretA(prv *PrivateKey, pub *PublicKey) []byte {
 	var sharedSecret = make([]byte, pub.params.SharedSecretSize)
-	var cparam ProjectiveCurveParameters
 	var xP, xQ, xQmP ProjectivePoint
-	var xR ProjectivePoint
+	var xK ProjectivePoint
+	var cparam ProjectiveCurveParameters
 	var phi = NewIsogeny4()
 	var jInv Fp2
 
 	// Recover curve coefficients
-	cparam.C = pub.params.OneFp2
 	RecoverCoordinateA(&cparam, &pub.affine_xP, &pub.affine_xQ, &pub.affine_xQmP)
+	// C=1
+	cparam.C = Params.OneFp2
 
 	// Find kernel of the morphism
 	xP = ProjectivePoint{X: pub.affine_xP, Z: pub.params.OneFp2}
 	xQ = ProjectivePoint{X: pub.affine_xQ, Z: pub.params.OneFp2}
 	xQmP = ProjectivePoint{X: pub.affine_xQmP, Z: pub.params.OneFp2}
-	xR = ScalarMul3Pt(&cparam, &xP, &xQ, &xQmP, pub.params.A.SecretBitLen, prv.Scalar)
+	xK = ScalarMul3Pt(&cparam, &xP, &xQ, &xQmP, pub.params.A.SecretBitLen, prv.Scalar)
 
 	// Traverse isogeny tree
-	traverseTreeSharedKeyA(&cparam, &xR, pub)
+	traverseTreeSharedKeyA(&cparam, &xK, pub)
 
 	// Calculate j-invariant on isogeneus curve
-	c := phi.GenerateCurve(&xR)
+	c := phi.GenerateCurve(&xK)
 	RecoverCurveCoefficients4(&cparam, &c)
 	Jinvariant(&cparam, &jInv)
 	convFp2ToBytes(sharedSecret, &jInv)
@@ -352,26 +327,27 @@
 func deriveSecretB(prv *PrivateKey, pub *PublicKey) []byte {
 	var sharedSecret = make([]byte, pub.params.SharedSecretSize)
 	var xP, xQ, xQmP ProjectivePoint
-	var xR ProjectivePoint
+	var xK ProjectivePoint
 	var cparam ProjectiveCurveParameters
 	var phi = NewIsogeny3()
 	var jInv Fp2
 
-	// Recover curve coefficients
-	cparam.C = pub.params.OneFp2
+	// Recover curve A coefficient
 	RecoverCoordinateA(&cparam, &pub.affine_xP, &pub.affine_xQ, &pub.affine_xQmP)
+	// C=1
+	cparam.C = Params.OneFp2
 
 	// Find kernel of the morphism
 	xP = ProjectivePoint{X: pub.affine_xP, Z: pub.params.OneFp2}
 	xQ = ProjectivePoint{X: pub.affine_xQ, Z: pub.params.OneFp2}
 	xQmP = ProjectivePoint{X: pub.affine_xQmP, Z: pub.params.OneFp2}
-	xR = ScalarMul3Pt(&cparam, &xP, &xQ, &xQmP, pub.params.B.SecretBitLen, prv.Scalar)
+	xK = ScalarMul3Pt(&cparam, &xP, &xQ, &xQmP, pub.params.B.SecretBitLen, prv.Scalar)
 
 	// Traverse isogeny tree
-	traverseTreeSharedKeyB(&cparam, &xR, pub)
+	traverseTreeSharedKeyB(&cparam, &xK, pub)
 
 	// Calculate j-invariant on isogeneus curve
-	c := phi.GenerateCurve(&xR)
+	c := phi.GenerateCurve(&xK)
 	RecoverCurveCoefficients3(&cparam, &c)
 	Jinvariant(&cparam, &jInv)
 	convFp2ToBytes(sharedSecret, &jInv)
@@ -379,9 +355,6 @@
 }
 
 func encrypt(skA *PrivateKey, pkA, pkB *PublicKey, ptext []byte) ([]byte, error) {
-	var n [40]byte // n can is max 320-bit (see 1.4 of [SIKE])
-	var ptextLen = len(ptext)
-
 	if pkB.keyVariant != KeyVariant_SIKE {
 		return nil, errors.New("wrong key type")
 	}
@@ -391,14 +364,19 @@
 		return nil, err
 	}
 
-	hashMac(n[:ptextLen], j, F)
-	for i, _ := range ptext {
-		n[i] ^= ptext[i]
+	if len(ptext) != pkA.params.KemSize {
+		panic("Implementation error")
 	}
 
-	ret := make([]byte, pkA.Size()+ptextLen)
+	digest := sha256.Sum256(j)
+	// Uses truncated digest (first 16-bytes)
+	for i, _ := range ptext {
+		digest[i] ^= ptext[i]
+	}
+
+	ret := make([]byte, pkA.Size()+len(ptext))
 	copy(ret, pkA.Export())
-	copy(ret[pkA.Size():], n[:ptextLen])
+	copy(ret[pkA.Size():], digest[:pkA.params.KemSize])
 	return ret, nil
 }
 
@@ -565,7 +543,7 @@
 func Encrypt(rng io.Reader, pub *PublicKey, ptext []byte) ([]byte, error) {
 	var ptextLen = len(ptext)
 	// c1 must be security level + 64 bits (see [SIKE] 1.4 and 4.3.3)
-	if ptextLen != (pub.params.KemSize + 8) {
+	if ptextLen != pub.params.KemSize {
 		return nil, errors.New("Unsupported message length")
 	}
 
@@ -583,9 +561,9 @@
 // decryption succeeds or error in case unexptected input was provided.
 // Constant time
 func Decrypt(prv *PrivateKey, ctext []byte) ([]byte, error) {
-	var n [40]byte // n can is max 320-bit (see 1.4 of [SIKE])
 	var c1_len int
-	var pk_len = prv.params.PublicKeySize
+	n := make([]byte, prv.params.KemSize)
+	pk_len := prv.params.PublicKeySize
 
 	if prv.keyVariant != KeyVariant_SIKE {
 		return nil, errors.New("wrong key type")
@@ -594,7 +572,7 @@
 	// ctext is a concatenation of (pubkey_A || c1=ciphertext)
 	// it must be security level + 64 bits (see [SIKE] 1.4 and 4.3.3)
 	c1_len = len(ctext) - pk_len
-	if c1_len != (int(prv.params.KemSize) + 8) {
+	if c1_len != int(prv.params.KemSize) {
 		return nil, errors.New("wrong size of cipher text")
 	}
 
@@ -608,8 +586,10 @@
 		return nil, err
 	}
 
-	hashMac(n[:c1_len], j, F)
-	for i, _ := range n[:c1_len] {
+	digest := sha256.Sum256(j)
+	copy(n, digest[:])
+
+	for i, _ := range n {
 		n[i] ^= ctext[pk_len+i]
 	}
 	return n[:c1_len], nil
@@ -621,11 +601,9 @@
 // Error is returned in case PRNG fails or wrongly formatted input was provided.
 func Encapsulate(rng io.Reader, pub *PublicKey) (ctext []byte, secret []byte, err error) {
 	// Buffer for random, secret message
-	var ptext = make([]byte, pub.params.MsgLen)
-	// r = G(ptext||pub)
-	var r = make([]byte, pub.params.A.SecretByteLen)
-	// Resulting shared secret
-	secret = make([]byte, pub.params.KemSize)
+	ptext := make([]byte, pub.params.MsgLen)
+	// SHA256 hash context object
+	d := sha256.New()
 
 	// Generate ephemeral value
 	_, err = io.ReadFull(rng, ptext)
@@ -633,13 +611,12 @@
 		return nil, nil, err
 	}
 
-	// must be big enough to store ptext+c0+c1
-	var hmac_key = make([]byte, pub.Size()+2*Params.MsgLen)
-	copy(hmac_key, ptext)
-	copy(hmac_key[len(ptext):], pub.Export())
-	hashMac(r, hmac_key[:len(ptext)+pub.Size()], G)
-	// Ensure bitlength is not bigger than to 2^e2-1
-	r[len(r)-1] &= (1 << (pub.params.A.SecretBitLen % 8)) - 1
+	// Implementation uses the first A.SecretByteLen bytes of the digest as the secret
+	d.Write(ptext)
+	d.Write(pub.Export())
+	digest := d.Sum(nil)
+	// r = G(ptext||pub)
+	r := digest[:pub.params.A.SecretByteLen]
 
 	// (c0 || c1) = Enc(pkA, ptext; r)
 	skA := NewPrivateKey(KeyVariant_SIDH_A)
@@ -655,10 +632,11 @@
 	}
 
 	// K = H(ptext||(c0||c1))
-	copy(hmac_key, ptext)
-	copy(hmac_key[len(ptext):], ctext)
-	hashMac(secret, hmac_key[:len(ptext)+len(ctext)], H)
-	return ctext, secret, nil
+	d.Reset()
+	d.Write(ptext)
+	d.Write(ctext)
+	digest = d.Sum(digest[:0])
+	return ctext, digest[:pub.params.KemSize], nil
 }
 
 // Decapsulate given the keypair and ciphertext as inputs, Decapsulate outputs a shared
@@ -666,10 +644,9 @@
 // Decapsulation may fail in case input is wrongly formatted.
 // Constant time for properly initialized input.
 func Decapsulate(prv *PrivateKey, pub *PublicKey, ctext []byte) ([]byte, error) {
-	var r = make([]byte, pub.params.A.SecretByteLen)
-	// Resulting shared secret
-	var secret = make([]byte, pub.params.KemSize)
 	var skA = NewPrivateKey(KeyVariant_SIDH_A)
+	// SHA256 hash context object
+	d := sha256.New()
 
 	m, err := Decrypt(prv, ctext)
 	if err != nil {
@@ -677,33 +654,30 @@
 	}
 
 	// r' = G(m'||pub)
-	var hmac_key = make([]byte, pub.Size()+2*Params.MsgLen)
-	copy(hmac_key, m)
-	copy(hmac_key[len(m):], pub.Export())
-	hashMac(r, hmac_key[:len(m)+pub.Size()], G)
-	// Ensure bitlength is not bigger than 2^e2-1
-	r[len(r)-1] &= (1 << (pub.params.A.SecretBitLen % 8)) - 1
-
+	d.Write(m)
+	d.Write(pub.Export())
+	digest := d.Sum(nil)
 	// Never fails
-	skA.Import(r)
+	skA.Import(digest[:pub.params.A.SecretByteLen])
 
 	// Never fails
 	pkA := skA.GeneratePublicKey()
 	c0 := pkA.Export()
 
+	d.Reset()
 	if subtle.ConstantTimeCompare(c0, ctext[:len(c0)]) == 1 {
-		copy(hmac_key, m)
+		d.Write(m)
 	} else {
-		// S is chosen at random when generating a key and unknown to other party. It
+		// S is chosen at random when generating a key and is unknown to the other party. It
 		// may seem weird, but it's correct. It is important that S is unpredictable
 		// to other party. Without this check, it is possible to recover a secret, by
 		// providing series of invalid ciphertexts. It is also important that in case
 		//
 		// See more details in "On the security of supersingular isogeny cryptosystems"
 		// (S. Galbraith, et al., 2016, ePrint #859).
-		copy(hmac_key, prv.S)
+		d.Write(prv.S)
 	}
-	copy(hmac_key[len(m):], ctext)
-	hashMac(secret, hmac_key[:len(m)+len(ctext)], H)
-	return secret, nil
+	d.Write(ctext)
+	digest = d.Sum(digest[:0])
+	return digest[:pub.params.KemSize], nil
 }
diff --git a/ssl/test/runner/sike/sike_test.go b/ssl/test/runner/sike/sike_test.go
index 2813504..2e146bc 100644
--- a/ssl/test/runner/sike/sike_test.go
+++ b/ssl/test/runner/sike/sike_test.go
@@ -28,41 +28,55 @@
 	name     string
 	PrB_sidh string
 	PkB_sidh string
+	PrA_sidh string
+	PkA_sidh string
 	PkB_sike string
 	PrB_sike string
-	PrA_sike string
-	PkA_sike string
 }{
-	name:     "P-503",
-	PkB_sike: "68460C22466E95864CFEA7B5D9077E768FF4F9ED69AE56D7CF3F236FB06B31020EEE34B5B572CEA5DDF20B531966AA8F5F3ACC0C6D1CE04EEDC30FD1F1233E2D96FE60C6D638FC646EAF2E2246F1AEC96859CE874A1F029A78F9C978CD6B22114A0D5AB20101191FD923E80C76908B1498B9D0200065CCA09159A0C65A1E346CC6470314FE78388DAA89DD08EC67DBE63C1F606674ACC49EBF9FDBB2B898B3CE733113AA6F942DB401A76D629CE6EE6C0FDAF4CFB1A5E366DB66C17B3923A1B7FB26A3FF25B9018869C674D3DEF4AF269901D686FE4647F9D2CDB2CEB3AFA305B27C885F037ED167F595066C21E7DD467D8332B934A5102DA5F13332DFA356B82156A0BB2E7E91C6B85B7D1E381BC9E3F0FC4DB9C36016D9ECEC415D7E977E9AC29910D934BA2FE4EE49D3B387607A4E1AFABF495FB86A77194626589E802FF5167C7A25C542C1EAD25A6E0AA931D94F2F9AFD3DBDF222E651F729A90E77B20974905F1E65E041CE6C95AAB3E1F22D332E0A5DE9C5DB3D9C7A38",
-	PrB_sike: "80FC55DA74DEFE3113487B80841E678AF9ED4E0599CF07353A4AB93971C090A0" +
-		"A9402C9DC98AC6DC8F5FDE5E970AE22BA48A400EFC72851C",
-	PrB_sidh: "A885A8B889520A6DBAD9FB33365E5B77FDED629440A16A533F259A510F63A822",
-	PrA_sike: "B0AD510708F4ABCF3E0D97DC2F2FF112D9D2AAE49D97FFD1E4267F21C6E71C03",
-	PkA_sike: "A6BADBA04518A924B20046B59AC197DCDF0EA48014C9E228C4994CCA432F360E" +
-		"2D527AFB06CA7C96EE5CEE19BAD53BF9218A3961CAD7EC092BD8D9EBB22A3D51" +
-		"33008895A3F1F6A023F91E0FE06A00A622FD6335DAC107F8EC4283DC2632F080" +
-		"4E64B390DAD8A2572F1947C67FDF4F8787D140CE2C6B24E752DA9A195040EDFA" +
-		"C27333FAE97DBDEB41DA9EEB2DB067AE7DA8C58C0EF57AEFC18A3D6BD0576FF2" +
-		"F1CFCAEC50C958331BF631F3D2E769790C7B6DF282B74BBC02998AD10F291D47" +
-		"C5A762FF84253D3B3278BDF20C8D4D4AA317BE401B884E26A1F02C7308AADB68" +
-		"20EBDB0D339F5A63346F3B40CACED72F544DAF51566C6E807D0E6E1E38514342" +
-		"432661DC9564DA07548570E256688CD9E8060D8775F95D501886D958588CACA0" +
-		"9F2D2AE1913F996E76AF63E31A179A7A7D2A46EDA03B2BCCF9020A5AA15F9A28" +
-		"9340B33F3AE7F97360D45F8AE1B9DD48779A57E8C45B50A02C00349CD1C58C55" +
-		"1D68BC2A75EAFED944E8C599C288037181E997471352E24C952B",
-	PkB_sidh: "244AF1F367C2C33912750A98497CC8214BC195BD52BD76513D32ACE4B75E31F0" +
-		"281755C265F5565C74E3C04182B9C244071859C8588CC7F09547CEFF8F7705D2" +
-		"60CE87D6BFF914EE7DBE4B9AF051CA420062EEBDF043AF58184495026949B068" +
-		"98A47046BFAE8DF3B447746184AF550553BB5D266D6E1967ACA33CAC5F399F90" +
-		"360D70867F2C71EF6F94FF915C7DA8BC9549FB7656E691DAEFC93CF56876E482" +
-		"CA2F8BE2D6CDCC374C31AD8833CABE997CC92305F38497BEC4DFD1821B004FEC" +
-		"E16448F9A24F965EFE409A8939EEA671633D9FFCF961283E59B8834BDF7EDDB3" +
-		"05D6275B61DA6692325432A0BAA074FC7C1F51E76208AB193A57520D40A76334" +
-		"EE5712BDC3E1EFB6103966F2329EDFF63082C4DFCDF6BE1C5A048630B81871B8" +
-		"83B735748A8FD4E2D9530C272163AB18105B10015CA7456202FE1C9B92CEB167" +
-		"5EAE1132E582C88E47ED87B363D45F05BEA714D5E9933D7AF4071CBB5D49008F" +
-		"3E3DAD7DFF935EE509D5DE561842B678CCEB133D62E270E9AC3E",
+	name:     "P-434",
+	PrA_sidh: "3A727E04EA9B7E2A766A6F846489E7E7B915263BCEED308BB10FC9",
+	PkA_sidh: "9E668D1E6750ED4B91EE052C32839CA9DD2E56D52BC24DECC950AA" +
+		"AD24CEED3F9049C77FE80F0B9B01E7F8DAD7833EEC2286544D6380" +
+		"009C379CDD3E7517CEF5E20EB01F8231D52FC30DC61D2F63FB357F" +
+		"85DC6396E8A95DB9740BD3A972C8DB7901B31F074CD3E45345CA78" +
+		"F900817130E688A29A7CF0073B5C00FF2C65FBE776918EF9BD8E75" +
+		"B29EF7FAB791969B60B0C5B37A8992EDEF95FA7BAC40A95DAFE02E" +
+		"237301FEE9A7A43FD0B73477E8035DD12B73FAFEF18D39904DDE36" +
+		"53A754F36BE1888F6607C6A7951349A414352CF31A29F2C40302DB" +
+		"406C48018C905EB9DC46AFBF42A9187A9BB9E51B587622A2862DC7" +
+		"D5CC598BF38ED6320FB51D8697AD3D7A72ABCC32A393F0133DA8DF" +
+		"5E253D9E00B760B2DF342FCE974DCFE946CFE4727783531882800F" +
+		"9E5DD594D6D5A6275EEFEF9713ED838F4A06BB34D7B8D46E0B385A" +
+		"AEA1C7963601",
+	PrB_sidh: "E37BFE55B43B32448F375903D8D226EC94ADBFEA1D2B3536EB987001",
+	PkB_sidh: "C9F73E4497AAA3FDF9EB688135866A8A83934BA10E273B8CC3808C" +
+		"F0C1F5FAB3E9BB295885881B73DEBC875670C0F51C4BB40DF5FEDE" +
+		"01B8AF32D1BF10508B8C17B2734EB93B2B7F5D84A4A0F2F816E9E2" +
+		"C32AC253C0B6025B124D05A87A9E2A8567930F44BAA14219B941B6" +
+		"B400B4AED1D796DA12A5A9F0B8F3F5EE9DD43F64CB24A3B1719DF2" +
+		"78ADF56B5F3395187829DA2319DEABF6BBD6EDA244DE2B62CC5AC2" +
+		"50C1009DD1CD4712B0B37406612AD002B5E51A62B51AC9C0374D14" +
+		"3ABBBD58275FAFC4A5E959C54838C2D6D9FB43B7B2609061267B6A" +
+		"2E6C6D01D295C4223E0D3D7A4CDCFB28A7818A737935279751A6DD" +
+		"8290FD498D1F6AD5F4FFF6BDFA536713F509DCE8047252F1E7D0DD" +
+		"9FCC414C0070B5DCCE3665A21A032D7FBE749181032183AFAD240B" +
+		"7E671E87FBBEC3A8CA4C11AA7A9A23AC69AE2ACF54B664DECD2775" +
+		"3D63508F1B02",
+	PrB_sike: "4B622DE1350119C45A9F2E2EF3DC5DF56A27FCDFCDDAF58CD69B90" +
+		"3752D68C200934E160B234E49EDE247601",
+	PkB_sike: "1BD0A2E81307B6F96461317DDF535ACC0E59C742627BAE60D27605" +
+		"E10FAF722D22A73E184CB572A12E79DCD58C6B54FB01442114CBE9" +
+		"010B6CAEC25D04C16C5E42540C1524C545B8C67614ED4183C9FA5B" +
+		"D0BE45A7F89FBC770EE8E7E5E391C7EE6F35F74C29E6D9E35B1663" +
+		"DA01E48E9DEB2347512D366FDE505161677055E3EF23054D276E81" +
+		"7E2C57025DA1C10D2461F68617F2D11256EEE4E2D7DBDF6C8E34F3" +
+		"A0FD00C625428CB41857002159DAB94267ABE42D630C6AAA91AF83" +
+		"7C7A6740754EA6634C45454C51B0BB4D44C3CCCCE4B32C00901CF6" +
+		"9C008D013348379B2F9837F428A01B6173584691F2A6F3A3C4CF48" +
+		"7D20D261B36C8CDB1BC158E2A5162A9DA4F7A97AA0879B9897E2B6" +
+		"891B672201F9AEFBF799C27B2587120AC586A511360926FB7DA8EB" +
+		"F5CB5272F396AE06608422BE9792E2CE9BEF21BF55B7EFF8DC7EC8" +
+		"C99910D3F800",
 }
 
 /* -------------------------------------------------------------------------
@@ -70,6 +84,7 @@
    -------------------------------------------------------------------------*/
 // Fail if err !=nil. Display msg as an error message
 func checkErr(t testing.TB, err error, msg string) {
+	t.Helper()
 	if err != nil {
 		t.Error(msg)
 	}
@@ -110,9 +125,9 @@
    Unit tests
    -------------------------------------------------------------------------*/
 func TestKeygen(t *testing.T) {
-	alicePrivate := convToPrv(tdata.PrA_sike, KeyVariant_SIDH_A)
+	alicePrivate := convToPrv(tdata.PrA_sidh, KeyVariant_SIDH_A)
 	bobPrivate := convToPrv(tdata.PrB_sidh, KeyVariant_SIDH_B)
-	expPubA := convToPub(tdata.PkA_sike, KeyVariant_SIDH_A)
+	expPubA := convToPub(tdata.PkA_sidh, KeyVariant_SIDH_A)
 	expPubB := convToPub(tdata.PkB_sidh, KeyVariant_SIDH_B)
 
 	pubA := alicePrivate.GeneratePublicKey()
@@ -132,7 +147,7 @@
 	b := NewPublicKey(KeyVariant_SIDH_B)
 
 	// Import keys
-	a_hex, err := hex.DecodeString(tdata.PkA_sike)
+	a_hex, err := hex.DecodeString(tdata.PkA_sidh)
 	checkErr(t, err, "invalid hex-number provided")
 
 	err = a.Import(a_hex)
@@ -210,7 +225,7 @@
 	}
 
 	// Negative case
-	dec, e := hex.DecodeString(tdata.PkA_sike)
+	dec, e := hex.DecodeString(tdata.PkA_sidh)
 	if e != nil {
 		t.FailNow()
 	}
@@ -387,6 +402,7 @@
 
 	// calculated shared secret
 	ct, ss_e, err := Encapsulate(rand.Reader, pk)
+
 	checkErr(t, err, "encapsulation failed")
 	ss_d, err := Decapsulate(sk, pk, ct)
 	checkErr(t, err, "decapsulation failed")
@@ -512,115 +528,106 @@
 }
 
 func TestKeyAgreement(t *testing.T) {
-	testKeyAgreement(t, tdata.PkA_sike, tdata.PrA_sike, tdata.PkB_sidh, tdata.PrB_sidh)
+	testKeyAgreement(t, tdata.PkA_sidh, tdata.PrA_sidh, tdata.PkB_sidh, tdata.PrB_sidh)
 }
 
 // Same values as in sike_test.cc
 func TestDecapsulation(t *testing.T) {
-
-	var sk = [56]byte{
-		0xDB, 0xAF, 0x2C, 0x89, 0xCA, 0x5A, 0xD4, 0x9D, 0x4F, 0x13,
-		0x40, 0xDF, 0x2D, 0xB1, 0x5F, 0x4C, 0x91, 0xA7, 0x1F, 0x0B,
-		0x29, 0x15, 0x01, 0x59, 0xBC, 0x5F, 0x0B, 0x4A, 0x03, 0x27,
-		0x6F, 0x18}
-
-	var pk = []byte{
-		0x07, 0xAA, 0x51, 0x45, 0x3E, 0x1F, 0x53, 0x2A, 0x0A, 0x05,
-		0x46, 0xF6, 0x54, 0x7F, 0x5D, 0x56, 0xD6, 0x76, 0xD3, 0xEA,
-		0x4B, 0x6B, 0x01, 0x9B, 0x11, 0x72, 0x6F, 0x75, 0xEA, 0x34,
-		0x3C, 0x28, 0x2C, 0x36, 0xFD, 0x77, 0xDA, 0xBE, 0xB6, 0x20,
-		0x18, 0xC1, 0x93, 0x98, 0x18, 0x86, 0x30, 0x2F, 0x2E, 0xD2,
-		0x00, 0x61, 0xFF, 0xAE, 0x78, 0xAE, 0xFB, 0x6F, 0x32, 0xAC,
-		0x06, 0xBF, 0x35, 0xF6, 0xF7, 0x5B, 0x98, 0x26, 0x95, 0xC2,
-		0xD8, 0xD6, 0x1C, 0x0E, 0x47, 0xDA, 0x76, 0xCE, 0xB5, 0xF1,
-		0x19, 0xCC, 0x01, 0xE1, 0x17, 0xA9, 0x62, 0xF7, 0x82, 0x6C,
-		0x25, 0x51, 0x25, 0xAE, 0xFE, 0xE3, 0xE2, 0xE1, 0x35, 0xAE,
-		0x2E, 0x8F, 0x38, 0xE0, 0x7C, 0x74, 0x3C, 0x1D, 0x39, 0x91,
-		0x1B, 0xC7, 0x9F, 0x8E, 0x33, 0x4E, 0x84, 0x19, 0xB8, 0xD9,
-		0xC2, 0x71, 0x35, 0x02, 0x47, 0x3E, 0x79, 0xEF, 0x47, 0xE1,
-		0xD8, 0x21, 0x96, 0x1F, 0x11, 0x59, 0x39, 0x34, 0x76, 0xEF,
-		0x3E, 0xB7, 0x4E, 0xFB, 0x7C, 0x55, 0xA1, 0x85, 0xAA, 0xAB,
-		0xAD, 0xF0, 0x09, 0xCB, 0xD1, 0xE3, 0x7C, 0x4F, 0x5D, 0x2D,
-		0xE1, 0x13, 0xF0, 0x71, 0xD9, 0xE5, 0xF6, 0xAF, 0x7F, 0xC1,
-		0x27, 0x95, 0x8D, 0x52, 0xD5, 0x96, 0x42, 0x38, 0x41, 0xF7,
-		0x24, 0x3F, 0x3A, 0xB5, 0x7E, 0x11, 0xE4, 0xF9, 0x33, 0xEE,
-		0x4D, 0xBE, 0x74, 0x48, 0xF9, 0x98, 0x04, 0x01, 0x16, 0xEB,
-		0xA9, 0x0D, 0x61, 0xC6, 0xFD, 0x4C, 0xCF, 0x98, 0x84, 0x4A,
-		0x94, 0xAC, 0x69, 0x2C, 0x02, 0x8B, 0xE3, 0xD1, 0x41, 0x0D,
-		0xF2, 0x2D, 0x46, 0x1F, 0x57, 0x1C, 0x77, 0x86, 0x18, 0xE3,
-		0x63, 0xDE, 0xF3, 0xE3, 0x02, 0x30, 0x54, 0x73, 0xAE, 0xC2,
-		0x32, 0xA2, 0xCE, 0xEB, 0xCF, 0x81, 0x46, 0x54, 0x5C, 0xF4,
-		0x5D, 0x2A, 0x03, 0x5D, 0x9C, 0xAE, 0xE0, 0x60, 0x03, 0x80,
-		0x11, 0x30, 0xA5, 0xAA, 0xD1, 0x75, 0x67, 0xE0, 0x1C, 0x2B,
-		0x6B, 0x5D, 0x83, 0xDE, 0x92, 0x9B, 0x0E, 0xD7, 0x11, 0x0F,
-		0x00, 0xC4, 0x59, 0xE4, 0x81, 0x04, 0x3B, 0xEE, 0x5C, 0x04,
-		0xD1, 0x0E, 0xD0, 0x67, 0xF5, 0xCC, 0xAA, 0x72, 0x73, 0xEA,
-		0xC4, 0x76, 0x99, 0x3B, 0x4C, 0x90, 0x2F, 0xCB, 0xD8, 0x0A,
-		0x5B, 0xEC, 0x0E, 0x0E, 0x1F, 0x59, 0xEA, 0x14, 0x8D, 0x34,
-		0x53, 0x65, 0x4C, 0x1A, 0x59, 0xA8, 0x95, 0x66, 0x60, 0xBB,
-		0xC4, 0xCC, 0x32, 0xA9, 0x8D, 0x2A, 0xAA, 0x14, 0x6F, 0x0F,
-		0x81, 0x4D, 0x32, 0x02, 0xFD, 0x33, 0x58, 0x42, 0xCF, 0xF3,
-		0x67, 0xD0, 0x9F, 0x0B, 0xB1, 0xCC, 0x18, 0xA5, 0xC4, 0x19,
-		0xB6, 0x00, 0xED, 0xFA, 0x32, 0x1A, 0x5F, 0x67, 0xC8, 0xC3,
-		0xEB, 0x0D, 0xB5, 0x9A, 0x36, 0x47, 0x82, 0x00,
+	var sk = [16 + 28]byte{
+		0x04, 0x5E, 0x01, 0x42, 0xB8, 0x2F, 0xE1, 0x9A, 0x38, 0x25,
+		0x92, 0xE7, 0xDC, 0xBA, 0xF7, 0x1B, 0xB1, 0xFD, 0x34, 0x42,
+		0xDB, 0x02, 0xBC, 0x9D, 0x4C, 0xD0, 0x72, 0x34, 0x4D, 0xBD,
+		0x06, 0xDF, 0x1C, 0x7D, 0x0A, 0x88, 0xB2, 0x50, 0xC4, 0xF6,
+		0xAE, 0xE8, 0x25, 0x01,
 	}
 
-	var ct = []byte{
-		0xE6, 0xB7, 0xE5, 0x7B, 0xA9, 0x19, 0xD1, 0x2C, 0xB8, 0x5C,
-		0x7B, 0x66, 0x74, 0xB0, 0x71, 0xA1, 0xFF, 0x71, 0x7F, 0x4B,
-		0xB5, 0xA6, 0xAF, 0x48, 0x32, 0x52, 0xD5, 0x82, 0xEE, 0x8A,
-		0xBB, 0x08, 0x1E, 0xF6, 0xAC, 0x91, 0xA2, 0xCB, 0x6B, 0x6A,
-		0x09, 0x2B, 0xD9, 0xC6, 0x27, 0xD6, 0x3A, 0x6B, 0x8D, 0xFC,
-		0xB8, 0x90, 0x8F, 0x72, 0xB3, 0xFA, 0x7D, 0x34, 0x7A, 0xC4,
-		0x7E, 0xE3, 0x30, 0xC5, 0xA0, 0xFE, 0x3D, 0x43, 0x14, 0x4E,
-		0x3A, 0x14, 0x76, 0x3E, 0xFB, 0xDF, 0xE3, 0xA8, 0xE3, 0x5E,
-		0x38, 0xF2, 0xE0, 0x39, 0x67, 0x60, 0xFD, 0xFB, 0xB4, 0x19,
-		0xCD, 0xE1, 0x93, 0xA2, 0x06, 0xCC, 0x65, 0xCD, 0x6E, 0xC8,
-		0xB4, 0x5E, 0x41, 0x4B, 0x6C, 0xA5, 0xF4, 0xE4, 0x9D, 0x52,
-		0x8C, 0x25, 0x60, 0xDD, 0x3D, 0xA9, 0x7F, 0xF2, 0x88, 0xC1,
-		0x0C, 0xEE, 0x97, 0xE0, 0xE7, 0x3B, 0xB7, 0xD3, 0x6F, 0x28,
-		0x79, 0x2F, 0x50, 0xB2, 0x4F, 0x74, 0x3A, 0x0C, 0x88, 0x27,
-		0x98, 0x3A, 0x27, 0xD3, 0x26, 0x83, 0x59, 0x49, 0x81, 0x5B,
-		0x0D, 0xA7, 0x0C, 0x4F, 0xEF, 0xFB, 0x1E, 0xAF, 0xE9, 0xD2,
-		0x1C, 0x10, 0x25, 0xEC, 0x9E, 0xFA, 0x57, 0x36, 0xAA, 0x3F,
-		0xC1, 0xA3, 0x2C, 0xE9, 0xB5, 0xC9, 0xED, 0x72, 0x51, 0x4C,
-		0x02, 0xB4, 0x7B, 0xB3, 0xED, 0x9F, 0x45, 0x03, 0x34, 0xAC,
-		0x9A, 0x9E, 0x62, 0x5F, 0x82, 0x7A, 0x77, 0x34, 0xF9, 0x21,
-		0x94, 0xD2, 0x38, 0x3D, 0x05, 0xF0, 0x8A, 0x60, 0x1C, 0xB7,
-		0x1D, 0xF5, 0xB7, 0x53, 0x77, 0xD3, 0x9D, 0x3D, 0x70, 0x6A,
-		0xCB, 0x18, 0x20, 0x6B, 0x29, 0x17, 0x3A, 0x6D, 0xA1, 0xB2,
-		0x64, 0xDB, 0x6C, 0xE6, 0x1A, 0x95, 0xA7, 0xF4, 0x1A, 0x78,
-		0x1D, 0xA2, 0x40, 0x15, 0x41, 0x59, 0xDD, 0xEE, 0x23, 0x57,
-		0xCE, 0x36, 0x0D, 0x55, 0xBD, 0xB8, 0xFD, 0x0F, 0x35, 0xBD,
-		0x5B, 0x92, 0xD6, 0x1C, 0x84, 0x8C, 0x32, 0x64, 0xA6, 0x5C,
-		0x45, 0x18, 0x07, 0x6B, 0xF9, 0xA9, 0x43, 0x9A, 0x83, 0xCD,
-		0xB5, 0xB3, 0xD9, 0x17, 0x99, 0x2C, 0x2A, 0x8B, 0xE0, 0x8E,
-		0xAF, 0xA6, 0x4C, 0x95, 0xBB, 0x70, 0x60, 0x1A, 0x3A, 0x97,
-		0xAA, 0x2F, 0x3D, 0x22, 0x83, 0xB7, 0x4F, 0x59, 0xED, 0x3F,
-		0x4E, 0xF4, 0x19, 0xC6, 0x25, 0x0B, 0x0A, 0x5E, 0x21, 0xB9,
-		0x91, 0xB8, 0x19, 0x84, 0x48, 0x78, 0xCE, 0x27, 0xBF, 0x41,
-		0x89, 0xF6, 0x30, 0xFD, 0x6B, 0xD9, 0xB8, 0x1D, 0x72, 0x8A,
-		0x56, 0xCC, 0x2F, 0x82, 0xE4, 0x46, 0x4D, 0x75, 0xD8, 0x92,
-		0xE6, 0x9C, 0xCC, 0xD2, 0xCD, 0x35, 0xE4, 0xFC, 0x2A, 0x85,
-		0x6B, 0xA9, 0xB2, 0x27, 0xC9, 0xA1, 0xFF, 0xB3, 0x96, 0x3E,
-		0x59, 0xF6, 0x4C, 0x66, 0x56, 0x2E, 0xF5, 0x1B, 0x97, 0x32,
-		0xB0, 0x71, 0x5A, 0x9C, 0x50, 0x4B, 0x6F, 0xC4, 0xCA, 0x94,
-		0x75, 0x37, 0x46, 0x10, 0x12, 0x2F, 0x4F, 0xA3, 0x82, 0xCD,
-		0xBD, 0x7C,
+	var pk = [330]byte{
+		0x6D, 0x8D, 0xF5, 0x7B, 0xCD, 0x47, 0xCA, 0xCB, 0x7A, 0x38,
+		0xB7, 0xA6, 0x90, 0xB7, 0x37, 0x03, 0xD4, 0x6F, 0x27, 0x73,
+		0x74, 0x17, 0x5A, 0xA4, 0x0D, 0xC6, 0x81, 0xAD, 0xDB, 0xF7,
+		0x18, 0xB2, 0x3C, 0x30, 0xCF, 0xAA, 0x08, 0x11, 0x91, 0xCC,
+		0x27, 0x4E, 0xF1, 0xA6, 0xB7, 0xDA, 0xD2, 0xCF, 0x99, 0x7F,
+		0xF7, 0xE1, 0xD0, 0xCE, 0x00, 0xD2, 0x4B, 0xA4, 0x33, 0xB4,
+		0x87, 0x01, 0x3F, 0x02, 0xF7, 0xF9, 0xDE, 0xC3, 0x60, 0x62,
+		0xDA, 0x3F, 0x74, 0xA9, 0x44, 0xBE, 0x19, 0xD5, 0x03, 0x2A,
+		0x79, 0x8C, 0xA7, 0xFF, 0xEA, 0xB3, 0xBB, 0xB5, 0xD4, 0x1D,
+		0x8F, 0x92, 0xCE, 0x62, 0x6E, 0x99, 0x24, 0xD7, 0x57, 0xFA,
+		0xCD, 0xB6, 0xE2, 0x8E, 0xFD, 0x22, 0x0E, 0x31, 0x21, 0x01,
+		0x8D, 0x79, 0xF8, 0x3E, 0x27, 0xEC, 0x43, 0x40, 0xDB, 0x82,
+		0xE5, 0xEB, 0x6C, 0x97, 0x66, 0x29, 0x15, 0x68, 0xB7, 0x4D,
+		0x84, 0xD1, 0x8A, 0x0B, 0x12, 0x36, 0x2C, 0x0C, 0x0A, 0x6E,
+		0x4E, 0xDE, 0xA5, 0x8A, 0xDE, 0x77, 0xDD, 0x70, 0x49, 0x73,
+		0xAC, 0x27, 0x6D, 0x8D, 0x25, 0x9A, 0xE4, 0x25, 0xE8, 0x95,
+		0x8F, 0xFE, 0x90, 0x3B, 0x00, 0x69, 0x20, 0xE8, 0x7C, 0xA5,
+		0xF5, 0x79, 0xC0, 0x61, 0x51, 0x91, 0x35, 0x25, 0x3F, 0x17,
+		0x2F, 0x70, 0x73, 0xF0, 0x89, 0xB5, 0xC8, 0x25, 0xB8, 0xE5,
+		0x7E, 0x34, 0xDD, 0x11, 0xE5, 0xD6, 0xC3, 0xD5, 0x29, 0x89,
+		0xC6, 0x2C, 0x99, 0x53, 0x1D, 0x2C, 0x77, 0xB0, 0xB6, 0xA1,
+		0xBD, 0x79, 0xFB, 0x4A, 0xC2, 0x48, 0x4C, 0x62, 0x51, 0x00,
+		0xE3, 0x91, 0x2A, 0xCB, 0x84, 0x03, 0x5D, 0x2D, 0xC8, 0x33,
+		0xE9, 0x14, 0xBF, 0x74, 0x21, 0xBC, 0xF4, 0x76, 0xE5, 0x42,
+		0xB8, 0xBD, 0xE2, 0xE7, 0x20, 0x95, 0x54, 0xF2, 0xED, 0xC0,
+		0x79, 0x38, 0x1E, 0xD2, 0xEA, 0x1A, 0x63, 0x85, 0xE7, 0x3A,
+		0xDA, 0xAD, 0xAB, 0x1B, 0x1E, 0x19, 0x9E, 0x73, 0xD0, 0x10,
+		0x2E, 0x38, 0xAC, 0x8B, 0x00, 0x6A, 0x30, 0x2C, 0x3D, 0x70,
+		0x8E, 0x39, 0x6D, 0xC0, 0x12, 0x61, 0x7D, 0x2A, 0x0A, 0x04,
+		0x95, 0x8E, 0x09, 0x3C, 0x7B, 0xEC, 0x2E, 0xBC, 0xE8, 0xE8,
+		0xE8, 0x37, 0x29, 0xC4, 0x7E, 0x76, 0x48, 0xB9, 0x3B, 0x72,
+		0xE5, 0x99, 0x9B, 0xF9, 0xE3, 0x99, 0x72, 0x3F, 0x35, 0x29,
+		0x85, 0xE0, 0xC8, 0xBF, 0xB1, 0x6B, 0xB1, 0x6E, 0x72, 0x00,
 	}
-	var ss_exp = []byte{
-		0x74, 0x3D, 0x25, 0x36, 0x00, 0x24, 0x63, 0x1A, 0x39, 0x1A,
-		0xB4, 0xAD, 0x01, 0x17, 0x78, 0xE9}
+
+	var ct = [330 + 16]byte{
+		0xFF, 0xEB, 0xEF, 0x4A, 0xC0, 0x57, 0x0F, 0x26, 0xAC, 0x76,
+		0xA8, 0xB0, 0xA3, 0x5D, 0x9C, 0xD9, 0x25, 0xD1, 0x7F, 0x92,
+		0x5D, 0xF4, 0x23, 0x34, 0xC3, 0x03, 0x10, 0xE1, 0xB0, 0x24,
+		0x9B, 0x44, 0x58, 0x26, 0x13, 0x56, 0x83, 0x43, 0x72, 0x69,
+		0x28, 0x0D, 0x55, 0x07, 0x1F, 0xDB, 0xC0, 0x23, 0x34, 0x83,
+		0x1A, 0x09, 0x9B, 0x80, 0x00, 0x64, 0x56, 0xDC, 0x79, 0x7A,
+		0xD2, 0xCE, 0x23, 0xC9, 0x72, 0x27, 0xFC, 0x8D, 0xAB, 0xBF,
+		0xD3, 0x17, 0xF6, 0x91, 0x7B, 0x15, 0x93, 0x83, 0x8A, 0x4F,
+		0x6C, 0xCA, 0x4A, 0x94, 0xDA, 0xC7, 0x9D, 0xB6, 0xD6, 0xBA,
+		0xBD, 0x81, 0x9A, 0x78, 0xE5, 0xE5, 0xBE, 0x17, 0xBC, 0xCB,
+		0xC8, 0x23, 0x80, 0x5F, 0x75, 0xF8, 0xDB, 0x51, 0x55, 0x00,
+		0x25, 0x33, 0x52, 0x64, 0xB2, 0xD6, 0xD8, 0x9A, 0x2A, 0x9E,
+		0x29, 0x99, 0x13, 0x33, 0xE2, 0xA7, 0x98, 0xAC, 0xD7, 0x79,
+		0x5C, 0x2F, 0xBA, 0x07, 0xC3, 0x03, 0x37, 0xD6, 0xE6, 0xB5,
+		0xA1, 0xF5, 0x29, 0xB6, 0xF6, 0xC0, 0x5C, 0x44, 0x68, 0x2B,
+		0x0B, 0xF5, 0x00, 0x01, 0x44, 0xD5, 0xCC, 0x23, 0xB5, 0x27,
+		0x4F, 0xCA, 0xB4, 0x05, 0x01, 0xF9, 0xD4, 0x41, 0xE0, 0xE1,
+		0x1E, 0xCF, 0xA9, 0xBC, 0x79, 0xD7, 0xD5, 0xF5, 0x3C, 0xE6,
+		0x93, 0xF4, 0x6C, 0x84, 0x5A, 0x2C, 0x4B, 0xE4, 0x91, 0xB2,
+		0xB2, 0xB8, 0xAD, 0x74, 0x9A, 0x69, 0x79, 0x4C, 0x84, 0xB7,
+		0xBF, 0xF1, 0x68, 0x4B, 0xAE, 0x0F, 0x7F, 0x45, 0x3B, 0x18,
+		0x3F, 0xFA, 0x00, 0x48, 0xE0, 0x3A, 0xE2, 0xC0, 0xAE, 0x00,
+		0xCE, 0x90, 0x28, 0xA4, 0x1B, 0xBE, 0xCA, 0x0C, 0x21, 0x29,
+		0x64, 0x30, 0x5E, 0x35, 0xAD, 0xFD, 0x83, 0x47, 0x40, 0x6D,
+		0x15, 0x56, 0xFC, 0xF8, 0x5F, 0xAB, 0x81, 0xFE, 0x6B, 0xE9,
+		0x6B, 0xED, 0x27, 0x35, 0x7C, 0xD8, 0x2C, 0xD4, 0xF2, 0x11,
+		0xE6, 0xAF, 0xDF, 0xB8, 0x91, 0x96, 0xEB, 0xF7, 0x4C, 0x8D,
+		0x70, 0x77, 0x90, 0x81, 0x00, 0x09, 0x19, 0x27, 0x8A, 0x9E,
+		0xB6, 0x1A, 0xE9, 0xAC, 0x6C, 0xC9, 0xF8, 0xEA, 0xA2, 0x34,
+		0xB8, 0xAC, 0xB3, 0xB3, 0x68, 0xA1, 0xB7, 0x29, 0x55, 0xCA,
+		0x40, 0x23, 0x92, 0x5C, 0x0C, 0x79, 0x6B, 0xD6, 0x9F, 0x5B,
+		0xD2, 0xE6, 0xAE, 0x04, 0xCB, 0xEC, 0xC7, 0x88, 0x18, 0xDB,
+		0x7A, 0xE6, 0xD6, 0xC9, 0x39, 0xFD, 0x93, 0x9B, 0xC8, 0x01,
+		0x6F, 0x3E, 0x6C, 0x90, 0x3E, 0x73, 0x76, 0x99, 0x7C, 0x48,
+		0xDA, 0x68, 0x48, 0x80, 0x2B, 0x63,
+	}
+	var ssExp = [16]byte{
+		0xA1, 0xF9, 0x5A, 0x67, 0xB9, 0x3D, 0x1E, 0x72, 0xE8, 0xC5,
+		0x71, 0xF1, 0x4C, 0xB2, 0xAA, 0x6D,
+	}
 
 	var prvObj = NewPrivateKey(KeyVariant_SIKE)
 	var pubObj = NewPublicKey(KeyVariant_SIKE)
 
-	if pubObj.Import(pk) != nil || prvObj.Import(sk[:]) != nil {
+	if pubObj.Import(pk[:]) != nil || prvObj.Import(sk[:]) != nil {
 		t.Error("Can't import one of the keys")
 	}
 
-	res, _ := Decapsulate(prvObj, pubObj, ct)
-	if !bytes.Equal(ss_exp, res) {
+	res, _ := Decapsulate(prvObj, pubObj, ct[:])
+	if !bytes.Equal(ssExp[:], res) {
 		t.Error("Wrong decapsulation result")
 	}
 }
@@ -629,10 +636,10 @@
    Benchmarking
    -------------------------------------------------------------------------*/
 
-func BenchmarkSidhKeyAgreementP503(b *testing.B) {
+func BenchmarkSidhKeyAgreement(b *testing.B) {
 	// KeyPairs
-	alicePublic := convToPub(tdata.PkA_sike, KeyVariant_SIDH_A)
-	alicePrivate := convToPrv(tdata.PrA_sike, KeyVariant_SIDH_A)
+	alicePublic := convToPub(tdata.PkA_sidh, KeyVariant_SIDH_A)
+	alicePrivate := convToPrv(tdata.PrA_sidh, KeyVariant_SIDH_A)
 	bobPublic := convToPub(tdata.PkB_sidh, KeyVariant_SIDH_B)
 	bobPrivate := convToPrv(tdata.PrB_sidh, KeyVariant_SIDH_B)
 
@@ -643,21 +650,21 @@
 	}
 }
 
-func BenchmarkAliceKeyGenPrvP503(b *testing.B) {
+func BenchmarkAliceKeyGenPrv(b *testing.B) {
 	prv := NewPrivateKey(KeyVariant_SIDH_A)
 	for n := 0; n < b.N; n++ {
 		prv.Generate(rand.Reader)
 	}
 }
 
-func BenchmarkBobKeyGenPrvP503(b *testing.B) {
+func BenchmarkBobKeyGenPrv(b *testing.B) {
 	prv := NewPrivateKey(KeyVariant_SIDH_B)
 	for n := 0; n < b.N; n++ {
 		prv.Generate(rand.Reader)
 	}
 }
 
-func BenchmarkAliceKeyGenPubP503(b *testing.B) {
+func BenchmarkAliceKeyGenPub(b *testing.B) {
 	prv := NewPrivateKey(KeyVariant_SIDH_A)
 	prv.Generate(rand.Reader)
 	for n := 0; n < b.N; n++ {
@@ -665,7 +672,7 @@
 	}
 }
 
-func BenchmarkBobKeyGenPubP503(b *testing.B) {
+func BenchmarkBobKeyGenPub(b *testing.B) {
 	prv := NewPrivateKey(KeyVariant_SIDH_B)
 	prv.Generate(rand.Reader)
 	for n := 0; n < b.N; n++ {
@@ -673,17 +680,17 @@
 	}
 }
 
-func BenchmarkSharedSecretAliceP503(b *testing.B) {
-	aPr := convToPrv(tdata.PrA_sike, KeyVariant_SIDH_A)
+func BenchmarkSharedSecretAlice(b *testing.B) {
+	aPr := convToPrv(tdata.PrA_sidh, KeyVariant_SIDH_A)
 	bPk := convToPub(tdata.PkB_sike, KeyVariant_SIDH_B)
 	for n := 0; n < b.N; n++ {
 		DeriveSecret(aPr, bPk)
 	}
 }
 
-func BenchmarkSharedSecretBobP503(b *testing.B) {
+func BenchmarkSharedSecretBob(b *testing.B) {
 	// m_B = 3*randint(0,3^238)
-	aPk := convToPub(tdata.PkA_sike, KeyVariant_SIDH_A)
+	aPk := convToPub(tdata.PkA_sidh, KeyVariant_SIDH_A)
 	bPr := convToPrv(tdata.PrB_sidh, KeyVariant_SIDH_B)
 	for n := 0; n < b.N; n++ {
 		DeriveSecret(bPr, aPk)
diff --git a/third_party/sike/P503.c b/third_party/sike/P503.c
deleted file mode 100644
index b8463e7..0000000
--- a/third_party/sike/P503.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/********************************************************************************************
-* SIDH: an efficient supersingular isogeny cryptography library
-*
-* Abstract: supersingular isogeny parameters and generation of functions for P503
-*********************************************************************************************/
-
-#include "utils.h"
-
-// Parameters for isogeny system "SIKEp503"
-const struct params_t p503 = {
-    .prime = {
-        U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF),
-        U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xABFFFFFFFFFFFFFF),
-        U64_TO_WORDS(0x13085BDA2211E7A0), U64_TO_WORDS(0x1B9BF6C87B7E7DAF),
-        U64_TO_WORDS(0x6045C6BDDA77A4D0), U64_TO_WORDS(0x004066F541811E1E)
-    },
-    .prime_p1 = {
-        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
-        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xAC00000000000000),
-        U64_TO_WORDS(0x13085BDA2211E7A0), U64_TO_WORDS(0x1B9BF6C87B7E7DAF),
-        U64_TO_WORDS(0x6045C6BDDA77A4D0), U64_TO_WORDS(0x004066F541811E1E)
-    },
-    .prime_x2 = {
-        U64_TO_WORDS(0xFFFFFFFFFFFFFFFE), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF),
-        U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0x57FFFFFFFFFFFFFF),
-        U64_TO_WORDS(0x2610B7B44423CF41), U64_TO_WORDS(0x3737ED90F6FCFB5E),
-        U64_TO_WORDS(0xC08B8D7BB4EF49A0), U64_TO_WORDS(0x0080CDEA83023C3C)
-    },
-    .A_gen = {
-        U64_TO_WORDS(0xE7EF4AA786D855AF), U64_TO_WORDS(0xED5758F03EB34D3B),
-        U64_TO_WORDS(0x09AE172535A86AA9), U64_TO_WORDS(0x237B9CC07D622723),
-        U64_TO_WORDS(0xE3A284CBA4E7932D), U64_TO_WORDS(0x27481D9176C5E63F),
-        U64_TO_WORDS(0x6A323FF55C6E71BF), U64_TO_WORDS(0x002ECC31A6FB8773),   // XPA0
-        U64_TO_WORDS(0x64D02E4E90A620B8), U64_TO_WORDS(0xDAB8128537D4B9F1),
-        U64_TO_WORDS(0x4BADF77B8A228F98), U64_TO_WORDS(0x0F5DBDF9D1FB7D1B),
-        U64_TO_WORDS(0xBEC4DB288E1A0DCC), U64_TO_WORDS(0xE76A8665E80675DB),
-        U64_TO_WORDS(0x6D6F252E12929463), U64_TO_WORDS(0x003188BD1463FACC),   // XPA1
-        U64_TO_WORDS(0xB79D41025DE85D56), U64_TO_WORDS(0x0B867DA9DF169686),
-        U64_TO_WORDS(0x740E5368021C827D), U64_TO_WORDS(0x20615D72157BF25C),
-        U64_TO_WORDS(0xFF1590013C9B9F5B), U64_TO_WORDS(0xC884DCADE8C16CEA),
-        U64_TO_WORDS(0xEBD05E53BF724E01), U64_TO_WORDS(0x0032FEF8FDA5748C),   // XQA0
-        U64_TO_WORDS(0x12E2E849AA0A8006), U64_TO_WORDS(0x41CF47008635A1E8),
-        U64_TO_WORDS(0x9CD720A70798AED7), U64_TO_WORDS(0x42A820B42FCF04CF),
-        U64_TO_WORDS(0x7BF9BAD32AAE88B1), U64_TO_WORDS(0xF619127A54090BBE),
-        U64_TO_WORDS(0x1CB10D8F56408EAA), U64_TO_WORDS(0x001D6B54C3C0EDEB),   // XRA0
-        U64_TO_WORDS(0x34DB54931CBAAC36), U64_TO_WORDS(0x420A18CB8DD5F0C4),
-        U64_TO_WORDS(0x32008C1A48C0F44D), U64_TO_WORDS(0x3B3BA772B1CFD44D),
-        U64_TO_WORDS(0xA74B058FDAF13515), U64_TO_WORDS(0x095FC9CA7EEC17B4),
-        U64_TO_WORDS(0x448E829D28F120F8), U64_TO_WORDS(0x00261EC3ED16A489)    // XRA1
-    },
-    .B_gen = {
-        U64_TO_WORDS(0x7EDE37F4FA0BC727), U64_TO_WORDS(0xF7F8EC5C8598941C),
-        U64_TO_WORDS(0xD15519B516B5F5C8), U64_TO_WORDS(0xF6D5AC9B87A36282),
-        U64_TO_WORDS(0x7B19F105B30E952E), U64_TO_WORDS(0x13BD8B2025B4EBEE),
-        U64_TO_WORDS(0x7B96D27F4EC579A2), U64_TO_WORDS(0x00140850CAB7E5DE),   // XPB0
-        U64_TO_WORDS(0x7764909DAE7B7B2D), U64_TO_WORDS(0x578ABB16284911AB),
-        U64_TO_WORDS(0x76E2BFD146A6BF4D), U64_TO_WORDS(0x4824044B23AA02F0),
-        U64_TO_WORDS(0x1105048912A321F3), U64_TO_WORDS(0xB8A2E482CF0F10C1),
-        U64_TO_WORDS(0x42FF7D0BE2152085), U64_TO_WORDS(0x0018E599C5223352),   // XPB1
-        U64_TO_WORDS(0x4256C520FB388820), U64_TO_WORDS(0x744FD7C3BAAF0A13),
-        U64_TO_WORDS(0x4B6A2DDDB12CBCB8), U64_TO_WORDS(0xE46826E27F427DF8),
-        U64_TO_WORDS(0xFE4A663CD505A61B), U64_TO_WORDS(0xD6B3A1BAF025C695),
-        U64_TO_WORDS(0x7C3BB62B8FCC00BD), U64_TO_WORDS(0x003AFDDE4A35746C),   // XQB0
-        U64_TO_WORDS(0x75601CD1E6C0DFCB), U64_TO_WORDS(0x1A9007239B58F93E),
-        U64_TO_WORDS(0xC1F1BE80C62107AC), U64_TO_WORDS(0x7F513B898F29FF08),
-        U64_TO_WORDS(0xEA0BEDFF43E1F7B2), U64_TO_WORDS(0x2C6D94018CBAE6D0),
-        U64_TO_WORDS(0x3A430D31BCD84672), U64_TO_WORDS(0x000D26892ECCFE83),   // XRB0
-        U64_TO_WORDS(0x1119D62AEA3007A1), U64_TO_WORDS(0xE3702AA4E04BAE1B),
-        U64_TO_WORDS(0x9AB96F7D59F990E7), U64_TO_WORDS(0xF58440E8B43319C0),
-        U64_TO_WORDS(0xAF8134BEE1489775), U64_TO_WORDS(0xE7F7774E905192AA),
-        U64_TO_WORDS(0xF54AE09308E98039), U64_TO_WORDS(0x001EF7A041A86112)    // XRB1
-    },
-    .mont_R2 = {
-        U64_TO_WORDS(0x5289A0CF641D011F), U64_TO_WORDS(0x9B88257189FED2B9),
-        U64_TO_WORDS(0xA3B365D58DC8F17A), U64_TO_WORDS(0x5BC57AB6EFF168EC),
-        U64_TO_WORDS(0x9E51998BD84D4423), U64_TO_WORDS(0xBF8999CBAC3B5695),
-        U64_TO_WORDS(0x46E9127BCE14CDB6), U64_TO_WORDS(0x003F6CFCE8B81771)
-    },
-    .mont_one = {
-        U64_TO_WORDS(0x00000000000003F9), U64_TO_WORDS(0x0000000000000000),
-        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xB400000000000000),
-        U64_TO_WORDS(0x63CB1A6EA6DED2B4), U64_TO_WORDS(0x51689D8D667EB37D),
-        U64_TO_WORDS(0x8ACD77C71AB24142), U64_TO_WORDS(0x0026FBAEC60F5953)
-    },
-    .A_strat = {
-        61, 32, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1,
-        4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1,
-        1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 29, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1,
-        1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 13, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2,
-        1, 1, 2, 1, 1, 5, 4, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1
-    },
-    .B_strat = {
-        71, 38, 21, 13, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 5, 4, 2, 1, 1, 2, 1,
-        1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 17, 9,
-        5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1,
-        1, 4, 2, 1, 1, 2, 1, 1, 33, 17, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1,
-        2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 1, 2,
-        1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1
-    }
-};
diff --git a/third_party/sike/asm/fp-armv8.pl b/third_party/sike/asm/fp-armv8.pl
index a1728d1..ce19d80 100644
--- a/third_party/sike/asm/fp-armv8.pl
+++ b/third_party/sike/asm/fp-armv8.pl
@@ -2,7 +2,7 @@
 #
 # April 2019
 #
-# Abstract: field arithmetic in aarch64 assembly for SIDH/p503
+# Abstract: field arithmetic in aarch64 assembly for SIDH/p434
 
 $flavour = shift;
 $output  = shift;
@@ -21,21 +21,23 @@
 $code.=<<___;
 .section  .rodata
 
-.Lp503p1_nz_s8:
-    .quad  0x085BDA2211E7A0AC, 0x9BF6C87B7E7DAF13
-    .quad  0x45C6BDDA77A4D01B, 0x4066F541811E1E60
-
-.Lp503x2:
+# p434 x 2
+.Lp434x2:
     .quad  0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF
-    .quad  0x57FFFFFFFFFFFFFF, 0x2610B7B44423CF41
-    .quad  0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0
-    .quad  0x0080CDEA83023C3C
+    .quad  0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47
+    .quad  0xD9F8BFAD038A40AC, 0x0004683E4E2EE688
+
+# p434 + 1 (four most-significant 64-bit words; the three low words are zero)
+.Lp434p1:
+    .quad  0xFDC1767AE3000000, 0x7BC65C783158AEA3
+    .quad  0x6CFC5FD681C52056, 0x0002341F27177344
 
 .text
 ___
 
-# C[0-2] = A[0] * B[0-1]
-sub mul64x128_comba_cut {
+# Computes C0-C2 = A0 * (B0-B1)
+# Inputs remain intact
+sub mul64x128 {
     my ($A0,$B0,$B1,$C0,$C1,$C2,$T0,$T1)=@_;
     my $body=<<___;
         mul     $T1, $A0, $B0
@@ -55,14 +57,161 @@
     return $body;
 }
 
-sub mul256_karatsuba_comba {
+# Computes C0-C4 = A0 * (B0-B3)
+# Inputs remain intact
+sub mul64x256 {
+    my ($A0,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$T0,$T1,$T2)=@_;
+    my $body=<<___;
+        mul     $C0, $A0, $B0    // C0
+        umulh   $T0, $A0, $B0
+
+        mul     $C1, $A0, $B1
+        umulh   $T1, $A0, $B1
+        adds    $C1, $C1, $T0    // C1
+        adc     $T0, xzr, xzr
+
+        mul     $C2, $A0, $B2
+        umulh   $T2, $A0, $B2
+        adds    $T1, $T0, $T1
+        adcs    $C2, $C2, $T1    // C2
+        adc     $T0, xzr, xzr
+
+        mul     $C3, $A0, $B3
+        umulh   $C4, $A0, $B3
+        adds    $T2, $T0, $T2
+        adcs    $C3, $C3, $T2    // C3
+        adc     $C4, $C4, xzr    // C4
+___
+    return $body;
+}
+
+# Computes C0-C5 = (A0-A1) * (B0-B3)
+# Inputs remain intact
+sub mul128x256 {
+    my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
+    my $body=<<___;
+        mul     $C0, $A0, $B0  // C0
+        umulh   $C3, $A0, $B0
+
+        mul     $C1, $A0, $B1
+        umulh   $C2, $A0, $B1
+
+        mul     $T0, $A1, $B0
+        umulh   $T1, $A1, $B0
+        adds    $C1, $C1, $C3
+        adc     $C2, $C2, xzr
+
+        mul     $T2, $A0, $B2
+        umulh   $T3, $A0, $B2
+        adds    $C1, $C1, $T0  // C1
+        adcs    $C2, $C2, $T1
+        adc     $C3, xzr, xzr
+
+        mul     $T0, $A1, $B1
+        umulh   $T1, $A1, $B1
+        adds    $C2, $C2, $T2
+        adcs    $C3, $C3, $T3
+        adc     $C4, xzr, xzr
+
+        mul     $T2, $A0, $B3
+        umulh   $T3, $A0, $B3
+        adds    $C2, $C2, $T0  // C2
+        adcs    $C3, $C3, $T1
+        adc     $C4, $C4, xzr
+
+        mul     $T0, $A1, $B2
+        umulh   $T1, $A1, $B2
+        adds    $C3, $C3, $T2
+        adcs    $C4, $C4, $T3
+        adc     $C5, xzr, xzr
+
+        mul     $T2, $A1, $B3
+        umulh   $T3, $A1, $B3
+        adds    $C3, $C3, $T0  // C3
+        adcs    $C4, $C4, $T1
+        adc     $C5, $C5, xzr
+        adds    $C4, $C4, $T2  // C4
+        adc     $C5, $C5, $T3  // C5
+
+___
+    return $body;
+}
+
+# Computes C0-C5 = (A0-A2) * (B0-B2)
+# Inputs remain intact
+sub mul192 {
+    my ($A0,$A1,$A2,$B0,$B1,$B2,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
+    my $body=<<___;
+
+        // A0 * B0
+        mul     $C0, $A0, $B0  // C0
+        umulh   $C3, $A0, $B0
+
+        // A0 * B1
+        mul     $C1, $A0, $B1
+        umulh   $C2, $A0, $B1
+
+        // A1 * B0
+        mul     $T0, $A1, $B0
+        umulh   $T1, $A1, $B0
+        adds    $C1, $C1, $C3
+        adc     $C2, $C2, xzr
+
+        // A0 * B2
+        mul     $T2, $A0, $B2
+        umulh   $T3, $A0, $B2
+        adds    $C1, $C1, $T0  // C1
+        adcs    $C2, $C2, $T1
+        adc     $C3, xzr, xzr
+
+        // A2 * B0
+        mul     $T0, $A2, $B0
+        umulh   $C4, $A2, $B0
+        adds    $C2, $C2, $T2
+        adcs    $C3, $C3, $C4
+        adc     $C4, xzr, xzr
+
+        // A1 * B1
+        mul     $T2, $A1, $B1
+        umulh   $T1, $A1, $B1
+        adds    $C2, $C2, $T0
+        adcs    $C3, $C3, $T3
+        adc     $C4, $C4, xzr
+
+        // A1 * B2
+        mul     $T0, $A1, $B2
+        umulh   $T3, $A1, $B2
+        adds    $C2, $C2, $T2 // C2
+        adcs    $C3, $C3, $T1
+        adc     $C4, $C4, xzr
+
+        // A2 * B1
+        mul     $T2, $A2, $B1
+        umulh   $T1, $A2, $B1
+        adds    $C3, $C3, $T0
+        adcs    $C4, $C4, $T3
+        adc     $C5, xzr, xzr
+
+        // A2 * B2
+        mul     $T0, $A2, $B2
+        umulh   $T3, $A2, $B2
+        adds    $C3, $C3, $T2 // C3
+        adcs    $C4, $C4, $T1
+        adc     $C5, $C5, xzr
+
+        adds    $C4, $C4, $T0 // C4
+        adc     $C5, $C5, $T3 // C5
+___
+    return $body;
+}
+sub mul256_karatsuba {
     my ($M,$A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7,$T0,$T1)=@_;
     # (AH+AL) x (BH+BL), low part
-    my $mul_low=&mul64x128_comba_cut($A1, $C6, $T1, $C3, $C4, $C5, $C7, $A0);
+    my $mul_low=&mul64x128($A1, $C6, $T1, $C3, $C4, $C5, $C7, $A0);
     # AL x BL
-    my $mul_albl=&mul64x128_comba_cut($A1, $B0, $B1, $C1, $T1, $C7, $C6, $A0);
+    my $mul_albl=&mul64x128($A1, $B0, $B1, $C1, $T1, $C7, $C6, $A0);
     # AH x BH
-    my $mul_ahbh=&mul64x128_comba_cut($A3, $B2, $B3, $A1, $C6, $B0, $B1, $A2);
+    my $mul_ahbh=&mul64x128($A3, $B2, $B3, $A1, $C6, $B0, $B1, $A2);
     my $body=<<___;
         // A0-A1 <- AH + AL, T0 <- mask
         adds    $A0, $A0, $A2
@@ -146,26 +295,25 @@
 # Operation: c [x2] = a [x0] * b [x1]
 sub mul {
     # (AH+AL) x (BH+BL), low part
-    my $mul_kc_low=&mul256_karatsuba_comba(
+    my $mul_kc_low=&mul256_karatsuba(
         "x2",                                           # M0
         "x3","x4","x5","x6",                            # A0-A3
-        "x11","x12","x13","x14",                        # B0-B3
-        "x8","x9","x10","x20","x21","x22","x23","x24",  # C0-C7
+        "x10","x11","x12","x13",                        # B0-B3
+        "x8","x9","x19","x20","x21","x22","x23","x24",  # C0-C7
         "x25","x26");                                   # TMP
     # AL x BL
-    my $mul_albl=&mul256_karatsuba_comba(
-        "x0",                                           # M0
+    my $mul_albl=&mul256_karatsuba(
+        "x0",                                           # M0
         "x3","x4","x5","x6",                            # A0-A3
-        "x11","x12","x13","x14",                        # B0-B3
+        "x10","x11","x12","x13",                        # B0-B3
         "x21","x22","x23","x24","x25","x26","x27","x28",# C0-C7
         "x8","x9");                                     # TMP
     # AH x BH
-    my $mul_ahbh=&mul256_karatsuba_comba(
-        "x0",                                           # M0
-        "x3","x4","x5","x6",                            # A0-A3
-        "x11","x12","x13","x14",                        # B0-B3
-        "x21","x22","x23","x24","x25","x26","x27","x28",# C0-C7
-        "x8","x9");                                     # TMP
+    my $mul_ahbh=&mul192(
+        "x3","x4","x5",                                 # A0-A2
+        "x10","x11","x12",                              # B0-B2
+        "x21","x22","x23","x24","x25","x26",            # C0-C5
+        "x8","x9","x27","x28");                         # TMP
 
     my $body=<<___;
         .global ${PREFIX}_mpmul
@@ -179,27 +327,27 @@
         stp     x25, x26, [sp,#64]
         stp     x27, x28, [sp,#80]
 
-        ldp     x3, x4, [x0]
-        ldp     x5, x6, [x0,#16]
-        ldp     x7, x8, [x0,#32]
-        ldp     x9, x10, [x0,#48]
-        ldp     x11, x12, [x1,#0]
-        ldp     x13, x14, [x1,#16]
-        ldp     x15, x16, [x1,#32]
-        ldp     x17, x19, [x1,#48]
+        ldp      x3,  x4, [x0]
+        ldp      x5,  x6, [x0,#16]
+        ldp      x7,  x8, [x0,#32]
+        ldr      x9,      [x0,#48]
+        ldp     x10, x11, [x1,#0]
+        ldp     x12, x13, [x1,#16]
+        ldp     x14, x15, [x1,#32]
+        ldr     x16,      [x1,#48]
 
         // x3-x7 <- AH + AL, x7 <- carry
         adds    x3, x3, x7
         adcs    x4, x4, x8
         adcs    x5, x5, x9
-        adcs    x6, x6, x10
+        adcs    x6, x6, xzr
         adc     x7, xzr, xzr
 
-        // x11-x14 <- BH + BL, x8 <- carry
-        adds    x11, x11, x15
+        // x10-x13 <- BH + BL, x8 <- carry
+        adds    x10, x10, x14
+        adcs    x11, x11, x15
         adcs    x12, x12, x16
-        adcs    x13, x13, x17
-        adcs    x14, x14, x19
+        adcs    x13, x13, xzr
         adc     x8, xzr, xzr
 
         // x9 <- combined carry
@@ -208,12 +356,11 @@
         sub      x7, xzr, x7
         sub      x8, xzr, x8
 
-
         // x15-x19 <- masked (BH + BL)
+        and     x14, x10, x7
         and     x15, x11, x7
         and     x16, x12, x7
         and     x17, x13, x7
-        and     x19, x14, x7
 
         // x20-x23 <- masked (AH + AL)
         and     x20, x3, x8
@@ -222,46 +369,46 @@
         and     x23, x6, x8
 
         // x15-x19, x7 <- masked (AH+AL) + masked (BH+BL), step 1
-        adds    x15, x15, x20
-        adcs    x16, x16, x21
-        adcs    x17, x17, x22
-        adcs    x19, x19, x23
+        adds    x14, x14, x20
+        adcs    x15, x15, x21
+        adcs    x16, x16, x22
+        adcs    x17, x17, x23
         adc     x7, x9, xzr
 
-        // x8-x10,x20-x24 <- (AH+AL) x (BH+BL), low part
+        // x8-x9,x19,x20-x24 <- (AH+AL) x (BH+BL), low part
         stp     x3, x4, [x2,#0]
         $mul_kc_low
 
         // x15-x19, x7 <- (AH+AL) x (BH+BL), final step
-        adds    x15, x15, x21
-        adcs    x16, x16, x22
-        adcs    x17, x17, x23
-        adcs    x19, x19, x24
+        adds    x14, x14, x21
+        adcs    x15, x15, x22
+        adcs    x16, x16, x23
+        adcs    x17, x17, x24
         adc     x7, x7, xzr
 
         // Load AL
         ldp     x3, x4, [x0]
         ldp     x5, x6, [x0,#16]
         // Load BL
-        ldp     x11, x12, [x1,#0]
-        ldp     x13, x14, [x1,#16]
+        ldp     x10, x11, [x1,#0]
+        ldp     x12, x13, [x1,#16]
 
-        // Temporarily store x8,x9 in x2
-        stp     x8,x9, [x2,#0]
+        // Temporarily store x8,x9 in x2
+        stp     x8, x9, [x2,#0]
         // x21-x28 <- AL x BL
         $mul_albl
-        // Restore x8,x9
-        ldp     x8,x9, [x2,#0]
+        // Restore x8,x9
+        ldp     x8, x9, [x2,#0]
 
         // x8-x10,x20,x15-x17,x19 <- maskd (AH+AL) x (BH+BL) - ALxBL
         subs    x8, x8, x21
         sbcs    x9, x9, x22
-        sbcs    x10, x10, x23
+        sbcs    x19, x19, x23
         sbcs    x20, x20, x24
-        sbcs    x15, x15, x25
-        sbcs    x16, x16, x26
-        sbcs    x17, x17, x27
-        sbcs    x19, x19, x28
+        sbcs    x14, x14, x25
+        sbcs    x15, x15, x26
+        sbcs    x16, x16, x27
+        sbcs    x17, x17, x28
         sbc     x7, x7, xzr
 
         // Store ALxBL, low
@@ -270,14 +417,14 @@
 
         // Load AH
         ldp     x3, x4, [x0,#32]
-        ldp     x5, x6, [x0,#48]
+        ldr     x5,     [x0,#48]
         // Load BH
-        ldp     x11, x12, [x1,#32]
-        ldp     x13, x14, [x1,#48]
+        ldp     x10, x11, [x1,#32]
+        ldr     x12,      [x1,#48]
 
-        adds    x8, x8, x25
-        adcs    x9, x9, x26
-        adcs    x10, x10, x27
+        adds     x8,  x8, x25
+        adcs     x9,  x9, x26
+        adcs    x19, x19, x27
         adcs    x20, x20, x28
         adc     x1, xzr, xzr
 
@@ -291,35 +438,32 @@
 
         neg     x1, x1
 
-        // x8-x10,x20,x15-x17,x19 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
+        // x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
         subs    x8, x8, x21
         sbcs    x9, x9, x22
-        sbcs    x10, x10, x23
+        sbcs    x19, x19, x23
         sbcs    x20, x20, x24
-        sbcs    x15, x15, x25
-        sbcs    x16, x16, x26
-        sbcs    x17, x17, x27
-        sbcs    x19, x19, x28
+        sbcs    x14, x14, x25
+        sbcs    x15, x15, x26
+        sbcs    x16, x16, xzr
+        sbcs    x17, x17, xzr
         sbc     x7, x7, xzr
 
         // Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low
-        stp     x8, x9, [x2,#32]
-        stp     x10, x20, [x2,#48]
+        stp      x8,  x9, [x2,#32]
+        stp     x19, x20, [x2,#48]
 
-        adds    x1, x1, #1
-        adcs    x15, x15, x21
-        adcs    x16, x16, x22
-        adcs    x17, x17, x23
-        adcs    x19, x19, x24
-        adcs    x25, x7, x25
-        adcs    x26, x26, xzr
-        adcs    x27, x27, xzr
-        adc     x28, x28, xzr
+        adds     x1,  x1, #1
+        adcs    x14, x14, x21
+        adcs    x15, x15, x22
+        adcs    x16, x16, x23
+        adcs    x17, x17, x24
+        adcs    x25,  x7, x25
+        adc     x26, x26, xzr
 
-        stp     x15, x16, [x2,#64]
-        stp     x17, x19, [x2,#80]
+        stp     x14, x15, [x2,#64]
+        stp     x16, x17, [x2,#80]
         stp     x25, x26, [x2,#96]
-        stp     x27, x28, [x2,#112]
 
         ldp     x19, x20, [x29,#16]
         ldp     x21, x22, [x29,#32]
@@ -333,250 +477,120 @@
 }
 $code.=&mul();
 
-# Computes C0-C4 = (A0-A1) * (B0-B3)
-# Inputs remain intact
-sub mul128x256_comba {
-    my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$T0,$T1,$T2,$T3)=@_;
-    my $body=<<___;
-        mul     $T0, $A1, $B0
-        umulh   $T1, $A1, $B0
-        adds    $C0, $C0, $C2
-        adc     $C1, $C1, xzr
-
-        mul     $T2, $A0, $B2
-        umulh   $T3, $A0, $B2
-        adds    $C0, $C0, $T0
-        adcs    $C1, $C1, $T1
-        adc     $C2, xzr, xzr
-
-        mul     $T0, $A1, $B1
-        umulh   $T1, $A1, $B1
-        adds    $C1, $C1, $T2
-        adcs    $C2, $C2, $T3
-        adc     $C3, xzr, xzr
-
-        mul     $T2, $A0, $B3
-        umulh   $T3, $A0, $B3
-        adds    $C1, $C1, $T0
-        adcs    $C2, $C2, $T1
-        adc     $C3, $C3, xzr
-
-        mul     $T0, $A1, $B2
-        umulh   $T1, $A1, $B2
-        adds    $C2, $C2, $T2
-        adcs    $C3, $C3, $T3
-        adc     $C4, xzr, xzr
-
-        mul     $T2, $A1, $B3
-        umulh   $T3, $A1, $B3
-        adds    $C2, $C2, $T0
-        adcs    $C3, $C3, $T1
-        adc     $C4, $C4, xzr
-        adds    $C3, $C3, $T2
-        adc     $C4, $C4, $T3
-
-___
-    return $body;
-}
-
 #  Montgomery reduction
 #  Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
 #  Operation: mc [x1] = ma [x0]
 #  NOTE: ma=mc is not allowed
 sub rdc {
-    my $mul01=&mul128x256_comba(
-        "x2","x3",                  # A0-A1
-        "x24","x25","x26","x27",    # B0-B3
-        "x5","x6","x7","x8","x9",   # C0-B4
-        "x1","x10","x11","x19");    # TMP
-    my $mul23=&mul128x256_comba(
-        "x2","x3",                  # A0-A1
-        "x24","x25","x26","x27",    # B0-B3
-        "x5","x6","x7","x8","x9",   # C0-C4
-        "x1","x10","x11","x19");    # TMP
-    my $mul45=&mul128x256_comba(
-        "x12","x13",                # A0-A1
-        "x24","x25","x26","x27",    # B0-B3
-        "x5","x6","x7","x8","x9",   # C0-C4
-        "x1","x10","x11","x19");    # TMP
-    my $mul67=&mul128x256_comba(
-        "x14","x15",                # A0-A1
-        "x24","x25","x26","x27",    # B0-B3
-        "x5","x6","x7","x8","x9",   # C0-C4
-        "x1","x10","x11","x19");    # TMP
+    my $mul01=&mul128x256(
+        "x2","x3",                     # A0-A1
+        "x23","x24","x25","x26",       # B0-B3
+        "x4","x5","x6","x7","x8","x9", # C0-C5
+        "x10","x11","x27","x28");      # TMP
+    my $mul23=&mul128x256(
+        "x2","x10",                    # A0-A1
+        "x23","x24","x25","x26",       # B0-B3
+        "x4","x5","x6","x7","x8","x9", # C0-C5
+        "x0","x3","x27","x28");        # TMP
+    my $mul45=&mul128x256(
+        "x11","x12",                   # A0-A1
+        "x23","x24","x25","x26",       # B0-B3
+        "x4","x5","x6","x7","x8","x9", # C0-C5
+        "x10","x3","x27","x28");       # TMP
+    my $mul67=&mul64x256(
+        "x13",                         # A0
+        "x23","x24","x25","x26",       # B0-B3
+        "x4","x5","x6","x7","x8",      # C0-C4
+        "x10","x27","x28");            # TMP
     my $body=<<___;
     .global ${PREFIX}_fprdc
     .align 4
     ${PREFIX}_fprdc:
-        stp     x29, x30, [sp, #-112]!
-        add     x29, sp, #0
+        stp     x29, x30, [sp, #-96]!
+        add     x29, sp, xzr
         stp     x19, x20, [sp,#16]
         stp     x21, x22, [sp,#32]
         stp     x23, x24, [sp,#48]
         stp     x25, x26, [sp,#64]
         stp     x27, x28, [sp,#80]
-        str     x1, [sp,#96]
 
         ldp     x2, x3, [x0,#0]       // a[0-1]
 
         // Load the prime constant
-        adrp    x23, :pg_hi21:.Lp503p1_nz_s8
-        add     x23, x23, :lo12:.Lp503p1_nz_s8
-        ldp     x24, x25, [x23, #0]
-        ldp     x26, x27, [x23, #16]
+        adrp    x26, :pg_hi21:.Lp434p1
+        add     x26, x26, :lo12:.Lp434p1
+        ldp     x23, x24, [x26, #0x0]
+        ldp     x25, x26, [x26,#0x10]
 
-        // a[0-1] x .Lp503p1_nz_s8 --> result: x4:x9
-        mul     x4, x2, x24           // a[0] x .Lp503p1_nz_s8[0]
-        umulh   x7, x2, x24
-        mul     x5, x2, x25           // a[0] x .Lp503p1_nz_s8[1]
-        umulh   x6, x2, x25
-
+        // a[0-1] * p434+1
         $mul01
 
-        ldp      x2,  x3, [x0,#16]     // a[2]
-        ldp     x12, x13, [x0,#32]
-        ldp     x14, x15, [x0,#48]
+        ldp     x10, x11, [x0, #0x18]
+        ldp     x12, x13, [x0, #0x28]
+        ldp     x14, x15, [x0, #0x38]
+        ldp     x16, x17, [x0, #0x48]
+        ldp     x19, x20, [x0, #0x58]
+        ldr     x21,      [x0, #0x68]
 
-        orr     x10, xzr, x9, lsr #8
-        lsl     x9, x9, #56
-        orr     x9, x9, x8, lsr #8
-        lsl     x8, x8, #56
-        orr     x8, x8, x7, lsr #8
-        lsl     x7, x7, #56
-        orr     x7, x7, x6, lsr #8
-        lsl     x6, x6, #56
-        orr     x6, x6, x5, lsr #8
-        lsl     x5, x5, #56
-        orr     x5, x5, x4, lsr #8
-        lsl     x4, x4, #56
+        adds     x10, x10, x4
+        adcs     x11, x11, x5
+        adcs     x12, x12, x6
+        adcs     x13, x13, x7
+        adcs     x14, x14, x8
+        adcs     x15, x15, x9
+        adcs     x22, x16, xzr
+        adcs     x17, x17, xzr
+        adcs     x19, x19, xzr
+        adcs     x20, x20, xzr
+        adc      x21, x21, xzr
 
-        adds     x3, x4,  x3          // a[3]
-        adcs    x12, x5, x12          // a[4]
-        adcs    x13, x6, x13
-        adcs    x14, x7, x14
-        adcs    x15, x8, x15
-        ldp     x16, x17, [x0,#64]
-        ldp     x28, x30, [x0,#80]
-        mul     x4,  x2, x24          // a[2] x .Lp503p1_nz_s8[0]
-        umulh   x7,  x2, x24
-        adcs    x16, x9, x16
-        adcs    x17, x10, x17
-        adcs    x28, xzr, x28
-        adcs    x30, xzr, x30
-        ldp     x20, x21, [x0,#96]
-        ldp     x22, x23, [x0,#112]
-        mul     x5,  x2, x25          // a[2] x .Lp503p1_nz_s8[1]
-        umulh   x6,  x2, x25
-        adcs    x20, xzr, x20
-        adcs    x21, xzr, x21
-        adcs    x22, xzr, x22
-        adc     x23, xzr, x23
-
-        // a[2-3] x .Lp503p1_nz_s8 --> result: x4:x9
+        ldr      x2,  [x0,#0x10]       // a[2]
+        // a[2-3] * p434+1
         $mul23
 
-        orr     x10, xzr, x9, lsr #8
-        lsl     x9, x9, #56
-        orr     x9, x9, x8, lsr #8
-        lsl     x8, x8, #56
-        orr     x8, x8, x7, lsr #8
-        lsl     x7, x7, #56
-        orr     x7, x7, x6, lsr #8
-        lsl     x6, x6, #56
-        orr     x6, x6, x5, lsr #8
-        lsl     x5, x5, #56
-        orr     x5, x5, x4, lsr #8
-        lsl     x4, x4, #56
+        adds    x12, x12, x4
+        adcs    x13, x13, x5
+        adcs    x14, x14, x6
+        adcs    x15, x15, x7
+        adcs    x16, x22, x8
+        adcs    x17, x17, x9
+        adcs    x22, x19, xzr
+        adcs    x20, x20, xzr
+        adc     x21, x21, xzr
 
-        adds    x13, x4, x13          // a[5]
-        adcs    x14, x5, x14          // a[6]
-        adcs    x15, x6, x15
-        adcs    x16, x7, x16
-        mul     x4, x12, x24          // a[4] x .Lp503p1_nz_s8[0]
-        umulh   x7, x12, x24
-        adcs    x17, x8, x17
-        adcs    x28, x9, x28
-        adcs    x30, x10, x30
-        adcs    x20, xzr, x20
-        mul     x5, x12, x25          // a[4] x .Lp503p1_nz_s8[1]
-        umulh   x6, x12, x25
-        adcs    x21, xzr, x21
-        adcs    x22, xzr, x22
-        adc     x23, xzr, x23
-
-        // a[4-5] x .Lp503p1_nz_s8 --> result: x4:x9
         $mul45
+        adds    x14, x14, x4
+        adcs    x15, x15, x5
+        adcs    x16, x16, x6
+        adcs    x17, x17, x7
+        adcs    x19, x22, x8
+        adcs    x20, x20, x9
+        adc     x22, x21, xzr
 
-        orr     x10, xzr, x9, lsr #8
-        lsl     x9, x9, #56
-        orr     x9, x9, x8, lsr #8
-        lsl     x8, x8, #56
-        orr     x8, x8, x7, lsr #8
-        lsl     x7, x7, #56
-        orr     x7, x7, x6, lsr #8
-        lsl     x6, x6, #56
-        orr     x6, x6, x5, lsr #8
-        lsl     x5, x5, #56
-        orr     x5, x5, x4, lsr #8
-        lsl     x4, x4, #56
+        stp     x14, x15, [x1, #0x0]     // C0, C1
 
-        adds    x15, x4, x15          // a[7]
-        adcs    x16, x5, x16          // a[8]
-        adcs    x17, x6, x17
-        adcs    x28, x7, x28
-        mul     x4, x14, x24          // a[6] x .Lp503p1_nz_s8[0]
-        umulh   x7, x14, x24
-        adcs    x30, x8, x30
-        adcs    x20, x9, x20
-        adcs    x21, x10, x21
-        mul     x5, x14, x25          // a[6] x .Lp503p1_nz_s8[1]
-        umulh   x6, x14, x25
-        adcs    x22, xzr, x22
-        adc     x23, xzr, x23
-
-        // a[6-7] x .Lp503p1_nz_s8 --> result: x4:x9
         $mul67
+        adds    x16, x16, x4
+        adcs    x17, x17, x5
+        adcs    x19, x19, x6
+        adcs    x20, x20, x7
+        adc     x21, x22, x8
 
-        orr     x10, xzr, x9, lsr #8
-        lsl     x9, x9, #56
-        orr     x9, x9, x8, lsr #8
-        lsl     x8, x8, #56
-        orr     x8, x8, x7, lsr #8
-        lsl     x7, x7, #56
-        orr     x7, x7, x6, lsr #8
-        lsl     x6, x6, #56
-        orr     x6, x6, x5, lsr #8
-        lsl     x5, x5, #56
-        orr     x5, x5, x4, lsr #8
-        lsl     x4, x4, #56
-
-        adds    x17, x4, x17
-        adcs    x28, x5, x28
-        ldr     x1, [sp,#96]
-        adcs    x30, x6, x30
-        adcs    x20, x7, x20
-        stp     x16, x17, [x1,#0]     // Final result
-        stp     x28, x30, [x1,#16]
-        adcs    x21, x8, x21
-        adcs    x22, x9, x22
-        adc     x23, x10, x23
-        stp     x20, x21, [x1,#32]
-        stp     x22, x23, [x1,#48]
+        str     x16,       [x1, #0x10]
+        stp     x17, x19,  [x1, #0x18]
+        stp     x20, x21,  [x1, #0x28]
 
         ldp     x19, x20, [x29,#16]
         ldp     x21, x22, [x29,#32]
         ldp     x23, x24, [x29,#48]
         ldp     x25, x26, [x29,#64]
         ldp     x27, x28, [x29,#80]
-        ldp     x29, x30, [sp],#112
+        ldp     x29, x30, [sp],#96
         ret
-
 ___
 }
-
 $code.=&rdc();
 
-
 #  Field addition
 #  Operation: c [x2] = a [x0] + b [x1]
 $code.=<<___;
@@ -588,49 +602,44 @@
 
         ldp     x3, x4,   [x0,#0]
         ldp     x5, x6,   [x0,#16]
+        ldp     x7, x8,   [x0,#32]
+        ldr     x9,       [x0,#48]
         ldp     x11, x12, [x1,#0]
         ldp     x13, x14, [x1,#16]
+        ldp     x15, x16, [x1,#32]
+        ldr     x17,      [x1,#48]
 
         // Add a + b
         adds    x3, x3, x11
         adcs    x4, x4, x12
         adcs    x5, x5, x13
         adcs    x6, x6, x14
-        ldp     x7, x8,   [x0,#32]
-        ldp     x9, x10,  [x0,#48]
-        ldp     x11, x12, [x1,#32]
-        ldp     x13, x14, [x1,#48]
-        adcs    x7, x7, x11
-        adcs    x8, x8, x12
-        adcs    x9, x9, x13
-        adc     x10, x10, x14
+        adcs    x7, x7, x15
+        adcs    x8, x8, x16
+        adc     x9, x9, x17
 
-        //  Subtract 2xp503
-        adrp    x17, :pg_hi21:.Lp503x2
-        add     x17, x17, :lo12:.Lp503x2
+        //  Subtract 2xp434
+        adrp    x17, :pg_hi21:.Lp434x2
+        add     x17, x17, :lo12:.Lp434x2
         ldp     x11, x12, [x17, #0]
         ldp     x13, x14, [x17, #16]
+        ldp     x15, x16, [x17, #32]
         subs    x3, x3, x11
         sbcs    x4, x4, x12
         sbcs    x5, x5, x12
         sbcs    x6, x6, x13
         sbcs    x7, x7, x14
-
-        ldp     x15, x16, [x17, #32]
-        ldr     x17,      [x17, #48]
         sbcs    x8, x8, x15
         sbcs    x9, x9, x16
-        sbcs    x10, x10, x17
         sbc     x0, xzr, xzr    // x0 can be reused now
 
-        // Add 2xp503 anded with the mask in x0
+        // Add 2xp434 anded with the mask in x0
         and     x11, x11, x0
         and     x12, x12, x0
         and     x13, x13, x0
         and     x14, x14, x0
         and     x15, x15, x0
         and     x16, x16, x0
-        and     x17, x17, x0
 
         adds    x3, x3, x11
         adcs    x4, x4, x12
@@ -638,17 +647,15 @@
         adcs    x6, x6, x13
         adcs    x7, x7, x14
         adcs    x8, x8, x15
-        adcs    x9, x9, x16
-        adc     x10, x10, x17
+        adc     x9, x9, x16
 
         stp     x3, x4,  [x2,#0]
         stp     x5, x6,  [x2,#16]
         stp     x7, x8,  [x2,#32]
-        stp     x9, x10, [x2,#48]
+        str     x9,      [x2,#48]
 
         ldp     x29, x30, [sp],#16
         ret
-
 ___
 
 #  Field subtraction
@@ -662,60 +669,58 @@
 
         ldp     x3, x4,   [x0,#0]
         ldp     x5, x6,   [x0,#16]
+        ldp     x7, x8,   [x0,#32]
+        ldr     x9,       [x0,#48]
         ldp     x11, x12, [x1,#0]
         ldp     x13, x14, [x1,#16]
+        ldp     x15, x16, [x1,#32]
+        ldr     x17,      [x1,#48]
 
         // Subtract a - b
         subs    x3, x3, x11
         sbcs    x4, x4, x12
         sbcs    x5, x5, x13
         sbcs    x6, x6, x14
-        ldp     x7, x8,   [x0,#32]
-        ldp     x11, x12, [x1,#32]
-        sbcs    x7, x7, x11
-        sbcs    x8, x8, x12
-        ldp     x9, x10,  [x0,#48]
-        ldp     x11, x12, [x1,#48]
-        sbcs    x9, x9, x11
-        sbcs    x10, x10, x12
-        sbc     x17, xzr, xzr
+        sbcs    x7, x7, x15
+        sbcs    x8, x8, x16
+        sbcs    x9, x9, x17
+        sbc     x0, xzr, xzr
 
-        // Add 2xp503 anded with the mask in x17
-        adrp    x16, :pg_hi21:.Lp503x2
-        add     x16, x16, :lo12:.Lp503x2
+        // Add 2xp434 anded with the mask in x0
+        adrp    x17, :pg_hi21:.Lp434x2
+        add     x17, x17, :lo12:.Lp434x2
 
         // First half
-        ldp     x11, x12, [x16, #0]
-        ldp     x13, x14, [x16, #16]
-        and     x11, x11, x17
-        and     x12, x12, x17
-        and     x13, x13, x17
+        ldp     x11, x12, [x17, #0]
+        ldp     x13, x14, [x17, #16]
+        ldp     x15, x16, [x17, #32]
+
+        // Add 2xp434 anded with the mask in x0
+        and     x11, x11, x0
+        and     x12, x12, x0
+        and     x13, x13, x0
+        and     x14, x14, x0
+        and     x15, x15, x0
+        and     x16, x16, x0
+
         adds    x3, x3, x11
         adcs    x4, x4, x12
         adcs    x5, x5, x12
         adcs    x6, x6, x13
+        adcs    x7, x7, x14
+        adcs    x8, x8, x15
+        adc     x9, x9, x16
+
         stp     x3, x4,  [x2,#0]
         stp     x5, x6,  [x2,#16]
-
-        // Second half
-        ldp     x11, x12, [x16, #32]
-        ldr     x13,      [x16, #48]
-        and     x14, x14, x17
-        and     x11, x11, x17
-        and     x12, x12, x17
-        and     x13, x13, x17
-        adcs    x7, x7, x14
-        adcs    x8, x8, x11
-        adcs    x9, x9, x12
-        adc     x10, x10, x13
         stp     x7, x8,  [x2,#32]
-        stp     x9, x10, [x2,#48]
+        str     x9,      [x2,#48]
 
         ldp     x29, x30, [sp],#16
         ret
 ___
 
-# 503-bit multiprecision addition
+# 434-bit multiprecision addition
 # Operation: c [x2] = a [x0] + b [x1]
 $code.=<<___;
     .global ${PREFIX}_mpadd_asm
@@ -726,92 +731,31 @@
 
         ldp     x3, x4,   [x0,#0]
         ldp     x5, x6,   [x0,#16]
+        ldp     x7, x8,   [x0,#32]
+        ldr     x9,       [x0,#48]
         ldp     x11, x12, [x1,#0]
         ldp     x13, x14, [x1,#16]
+        ldp     x15, x16, [x1,#32]
+        ldr     x17,      [x1,#48]
 
         adds    x3, x3, x11
         adcs    x4, x4, x12
         adcs    x5, x5, x13
         adcs    x6, x6, x14
-        ldp     x7, x8,   [x0,#32]
-        ldp     x9, x10,  [x0,#48]
-        ldp     x11, x12, [x1,#32]
-        ldp     x13, x14, [x1,#48]
-        adcs    x7, x7, x11
-        adcs    x8, x8, x12
-        adcs    x9, x9, x13
-        adc     x10, x10, x14
+        adcs    x7, x7, x15
+        adcs    x8, x8, x16
+        adc     x9, x9, x17
 
         stp     x3, x4,   [x2,#0]
         stp     x5, x6,   [x2,#16]
         stp     x7, x8,   [x2,#32]
-        stp     x9, x10,  [x2,#48]
+        str     x9,       [x2,#48]
 
         ldp     x29, x30, [sp],#16
         ret
 ___
 
-
-# 2x503-bit multiprecision addition
-# Operation: c [x2] = a [x0] + b [x1]
-$code.=<<___;
-    .global ${PREFIX}_mpadd503x2_asm
-    .align 4
-    ${PREFIX}_mpadd503x2_asm:
-        stp     x29, x30, [sp,#-16]!
-        add     x29, sp, #0
-
-        ldp     x3, x4,   [x0,#0]
-        ldp     x5, x6,   [x0,#16]
-        ldp     x11, x12, [x1,#0]
-        ldp     x13, x14, [x1,#16]
-        adds    x3, x3, x11
-        adcs    x4, x4, x12
-        adcs    x5, x5, x13
-        adcs    x6, x6, x14
-        ldp     x7, x8,   [x0,#32]
-        ldp     x9, x10,  [x0,#48]
-        ldp     x11, x12, [x1,#32]
-        ldp     x13, x14, [x1,#48]
-        adcs    x7, x7, x11
-        adcs    x8, x8, x12
-        adcs    x9, x9, x13
-        adcs    x10, x10, x14
-
-        stp     x3, x4,   [x2,#0]
-        stp     x5, x6,   [x2,#16]
-        stp     x7, x8,   [x2,#32]
-        stp     x9, x10,  [x2,#48]
-
-        ldp     x3, x4,   [x0,#64]
-        ldp     x5, x6,   [x0,#80]
-        ldp     x11, x12, [x1,#64]
-        ldp     x13, x14, [x1,#80]
-        adcs    x3, x3, x11
-        adcs    x4, x4, x12
-        adcs    x5, x5, x13
-        adcs    x6, x6, x14
-        ldp     x7, x8,   [x0,#96]
-        ldp     x9, x10,  [x0,#112]
-        ldp     x11, x12, [x1,#96]
-        ldp     x13, x14, [x1,#112]
-        adcs    x7, x7, x11
-        adcs    x8, x8, x12
-        adcs    x9, x9, x13
-        adc     x10, x10, x14
-
-        stp     x3, x4,   [x2,#64]
-        stp     x5, x6,   [x2,#80]
-        stp     x7, x8,   [x2,#96]
-        stp     x9, x10,  [x2,#112]
-
-        ldp     x29, x30, [sp],#16
-        ret
-___
-
-
-
-# 2x503-bit multiprecision subtraction
+# 2x434-bit multiprecision subtraction
 # Operation: c [x2] = a [x0] - b [x1].
 # Returns borrow mask
 $code.=<<___;
@@ -852,111 +796,114 @@
         sbcs    x5, x5, x13
         sbcs    x6, x6, x14
         ldp     x7, x8,   [x0,#96]
-        ldp     x9, x10,  [x0,#112]
         ldp     x11, x12, [x1,#96]
-        ldp     x13, x14, [x1,#112]
         sbcs    x7, x7, x11
         sbcs    x8, x8, x12
-        sbcs    x9, x9, x13
-        sbcs    x10, x10, x14
         sbc     x0, xzr, xzr
 
         stp     x3, x4,   [x2,#64]
         stp     x5, x6,   [x2,#80]
         stp     x7, x8,   [x2,#96]
-        stp     x9, x10,  [x2,#112]
 
         ldp     x29, x30, [sp],#16
         ret
 ___
 
 
-# Double 2x503-bit multiprecision subtraction
+# Double 2x434-bit multiprecision subtraction
 # Operation: c [x2] = c [x2] - a [x0] - b [x1]
 $code.=<<___;
     .global ${PREFIX}_mpdblsubx2_asm
     .align 4
     ${PREFIX}_mpdblsubx2_asm:
-        stp     x29, x30, [sp, #-64]!
+        stp     x29, x30, [sp, #-16]!
         add     x29, sp, #0
 
-        stp     x20, x21, [sp, #16]
-        stp     x22, x23, [sp, #32]
-        str     x24,      [sp, #48]
-
-        ldp     x3, x4,   [x2,#0]
+        ldp     x3, x4,   [x2, #0]
         ldp     x5, x6,   [x2,#16]
         ldp     x7, x8,   [x2,#32]
-        ldp     x9, x10,  [x2,#48]
-        ldp     x11, x12, [x2,#64]
-        ldp     x13, x14, [x2,#80]
-        ldp     x15, x16, [x2,#96]
-        ldp     x17, x24, [x2,#112]
 
-        ldp     x20, x21, [x0,#0]
-        ldp     x22, x23, [x0,#16]
-        subs    x3, x3, x20
-        sbcs    x4, x4, x21
-        sbcs    x5, x5, x22
-        sbcs    x6, x6, x23
-        ldp     x20, x21, [x0,#32]
-        ldp     x22, x23, [x0,#48]
-        sbcs    x7, x7, x20
-        sbcs    x8, x8, x21
-        sbcs    x9, x9, x22
-        sbcs    x10, x10, x23
-        ldp     x20, x21, [x0,#64]
-        ldp     x22, x23, [x0,#80]
-        sbcs    x11, x11, x20
-        sbcs    x12, x12, x21
-        sbcs    x13, x13, x22
-        sbcs    x14, x14, x23
-        ldp     x20, x21, [x0,#96]
-        ldp     x22, x23, [x0,#112]
-        sbcs    x15, x15, x20
-        sbcs    x16, x16, x21
-        sbcs    x17, x17, x22
-        sbc     x24, x24, x23
+        ldp     x11, x12, [x0, #0]
+        ldp     x13, x14, [x0,#16]
+        ldp     x15, x16, [x0,#32]
 
-        ldp     x20, x21, [x1,#0]
-        ldp     x22, x23, [x1,#16]
-        subs    x3, x3, x20
-        sbcs    x4, x4, x21
-        sbcs    x5, x5, x22
-        sbcs    x6, x6, x23
-        ldp     x20, x21, [x1,#32]
-        ldp     x22, x23, [x1,#48]
-        sbcs    x7, x7, x20
-        sbcs    x8, x8, x21
-        sbcs    x9, x9, x22
-        sbcs    x10, x10, x23
-        ldp     x20, x21, [x1,#64]
-        ldp     x22, x23, [x1,#80]
-        sbcs    x11, x11, x20
-        sbcs    x12, x12, x21
-        sbcs    x13, x13, x22
-        sbcs    x14, x14, x23
-        ldp     x20, x21, [x1,#96]
-        ldp     x22, x23, [x1,#112]
-        sbcs    x15, x15, x20
-        sbcs    x16, x16, x21
-        sbcs    x17, x17, x22
-        sbc     x24, x24, x23
+        subs    x3, x3, x11
+        sbcs    x4, x4, x12
+        sbcs    x5, x5, x13
+        sbcs    x6, x6, x14
+        sbcs    x7, x7, x15
+        sbcs    x8, x8, x16
 
-        stp     x3, x4,   [x2,#0]
+        // x9 stores carry
+        adc     x9, xzr, xzr
+
+        ldp     x11, x12, [x1, #0]
+        ldp     x13, x14, [x1,#16]
+        ldp     x15, x16, [x1,#32]
+        subs    x3, x3, x11
+        sbcs    x4, x4, x12
+        sbcs    x5, x5, x13
+        sbcs    x6, x6, x14
+        sbcs    x7, x7, x15
+        sbcs    x8, x8, x16
+        adc     x9, x9, xzr
+
+        stp     x3, x4,   [x2, #0]
         stp     x5, x6,   [x2,#16]
         stp     x7, x8,   [x2,#32]
-        stp     x9, x10,  [x2,#48]
-        stp     x11, x12, [x2,#64]
-        stp     x13, x14, [x2,#80]
-        stp     x15, x16, [x2,#96]
-        stp     x17, x24, [x2,#112]
 
-        ldp     x20, x21, [x29,#16]
-        ldp     x22, x23, [x29,#32]
-        ldr     x24,      [x29,#48]
+        ldp     x3, x4,   [x2,#48]
+        ldp     x5, x6,   [x2,#64]
+        ldp     x7, x8,   [x2,#80]
 
-        ldp     x29, x30, [sp],#64
+        ldp     x11, x12, [x0,#48]
+        ldp     x13, x14, [x0,#64]
+        ldp     x15, x16, [x0,#80]
+
+        // x9 = 2 - x9
+        neg     x9, x9
+        add     x9, x9, #2
+
+        subs    x3, x3, x9
+        sbcs    x3, x3, x11
+        sbcs    x4, x4, x12
+        sbcs    x5, x5, x13
+        sbcs    x6, x6, x14
+        sbcs    x7, x7, x15
+        sbcs    x8, x8, x16
+        adc     x9, xzr, xzr
+
+        ldp     x11, x12, [x1,#48]
+        ldp     x13, x14, [x1,#64]
+        ldp     x15, x16, [x1,#80]
+        subs    x3, x3, x11
+        sbcs    x4, x4, x12
+        sbcs    x5, x5, x13
+        sbcs    x6, x6, x14
+        sbcs    x7, x7, x15
+        sbcs    x8, x8, x16
+        adc     x9, x9, xzr
+
+        stp     x3, x4,   [x2,#48]
+        stp     x5, x6,   [x2,#64]
+        stp     x7, x8,   [x2,#80]
+
+        ldp      x3,  x4, [x2,#96]
+        ldp     x11, x12, [x0,#96]
+        ldp     x13, x14, [x1,#96]
+
+        // x9 = 2 - x9
+        neg     x9, x9
+        add     x9, x9, #2
+
+        subs    x3, x3, x9
+        sbcs    x3, x3, x11
+        sbcs    x4, x4, x12
+        subs    x3, x3, x13
+        sbc     x4, x4, x14
+        stp     x3, x4,   [x2,#96]
+
+        ldp     x29, x30, [sp],#16
         ret
 ___
 
diff --git a/third_party/sike/asm/fp-x86_64.pl b/third_party/sike/asm/fp-x86_64.pl
index c093c20..cffde1a 100755
--- a/third_party/sike/asm/fp-x86_64.pl
+++ b/third_party/sike/asm/fp-x86_64.pl
@@ -2,7 +2,7 @@
 #
 # April 2019
 #
-# Abstract: field arithmetic in x64 assembly for SIDH/p503
+# Abstract: field arithmetic in x64 assembly for SIDH/p434
 
 $flavour = shift;
 $output  = shift;
@@ -22,76 +22,341 @@
 $code.=<<___;
 .text
 
-# p503 x 2
-.Lp503x2:
+# p434 x 2
+.Lp434x2:
 .quad   0xFFFFFFFFFFFFFFFE
 .quad   0xFFFFFFFFFFFFFFFF
-.quad   0x57FFFFFFFFFFFFFF
-.quad   0x2610B7B44423CF41
-.quad   0x3737ED90F6FCFB5E
-.quad   0xC08B8D7BB4EF49A0
-.quad   0x0080CDEA83023C3C
+.quad   0xFB82ECF5C5FFFFFF
+.quad   0xF78CB8F062B15D47
+.quad   0xD9F8BFAD038A40AC
+.quad   0x0004683E4E2EE688
 
-# p503 + 1
-.Lp503p1:
-.quad   0xAC00000000000000
-.quad   0x13085BDA2211E7A0
-.quad   0x1B9BF6C87B7E7DAF
-.quad   0x6045C6BDDA77A4D0
-.quad   0x004066F541811E1E
-
-.Lp503p1_nz:
-.quad    0xAC00000000000000
-.quad    0x13085BDA2211E7A0
-.quad    0x1B9BF6C87B7E7DAF
-.quad    0x6045C6BDDA77A4D0
-.quad    0x004066F541811E1E
+# p434 + 1
+.Lp434p1:
+.quad   0xFDC1767AE3000000
+.quad   0x7BC65C783158AEA3
+.quad   0x6CFC5FD681C52056
+.quad   0x0002341F27177344
 
 .extern OPENSSL_ia32cap_P
 .hidden OPENSSL_ia32cap_P
-
 ___
 
-# Performs schoolbook multiplication of 128-bit with 320-bit
-# number. Uses MULX, ADOX, ADCX instruction.
-sub mul128x320_school {
-  my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7,$T8,$T9)=@_;
-  my ($MUL0,$MUL8)=map("$idxM0+$_(%$M0)", (0,8));
+# Jump to the alternative implementation provided as an
+# argument in case the CPU supports ADOX/ADCX and MULX instructions.
+sub alt_impl {
+  $jmp_func = shift;
+
+  $body=<<___;
+  lea OPENSSL_ia32cap_P(%rip), %rcx
+  mov 8(%rcx), %rcx
+  and \$0x80100, %ecx
+  cmp \$0x80100, %ecx
+  je  $jmp_func
+
+___
+  return $body
+}
+
+# Performs schoolbook multiplication of two 192-bit numbers. Uses
+# MULX/ADOX/ADCX instructions. The 384-bit result is stored at $DST.
+sub mul192 {
+  my ($idxM0,$M0,$idxM1,$M1,$idxDST,$DST,$T0,$T1,$T2,$T3,$T4,$T5,$T6)=@_;
+  my ($ML0,$ML8,$ML16)=map("$idxM0+$_($M0)",(0,8,16));
+  my ($MR0,$MR8,$MR16)=map("$idxM1+$_($M1)",(0,8,16));
+  my ($D0,$D1,$D2,$D3,$D4,$D5)=map("$idxDST+$_($DST)",(0,8,16,24,32,40));
+
+  $body=<<___;
+  mov    $ML0, %rdx
+  mulx   $MR0, $T1, $T0   # T0:T1 = A0*B0
+  mov    $T1, $D0         # DST0
+  mulx   $MR8, $T2, $T1   # T1:T2 = A0*B1
+  xor    %rax, %rax
+  adox   $T2, $T0
+  mulx   $MR16,$T3, $T2   # T2:T3 = A0*B2
+  adox   $T3, $T1
+
+  mov    $ML8, %rdx
+  mulx   $MR0, $T4, $T3   # T3:T4 = A1*B0
+  adox   %rax, $T2
+  xor    %rax, %rax
+
+  mulx   $MR8, $T6, $T5   # T6:T7 = A1*B1
+  adox   $T0, $T4
+  mov    $T4, $D1         # DST1
+  adcx   $T6, $T3
+
+  mulx   $MR16,$T0, $T6   # T6:T0 = A1*B2
+  adox   $T1, $T3
+  adcx   $T0, $T5
+  adcx   %rax, $T6
+  adox   $T2, $T5
+
+  mov    $ML16,%rdx
+  mulx   $MR0, $T0, $T1   # T1:T0 = A2*B0
+  adox   %rax, $T6
+  xor    %rax, %rax
+
+  mulx   $MR8, $T2, $T4   # T4:T2 = A2*B1
+  adox   $T3, $T0
+  mov    $T0, $D2         # DST2
+  adcx   $T5, $T1
+
+  mulx   $MR16,$T3, $T0   # T0:T3 = A2*B2
+  adcx   $T6, $T4
+  adcx   %rax, $T0
+  adox   $T2, $T1
+  adox   $T4, $T3
+  adox   %rax, $T0
+  mov    $T1, $D3          # DST3
+  mov    $T3, $D4          # DST4
+  mov    $T0, $D5          # DST5
+
+___
+  return $body;
+}
+
+# Performs schoolbook multiplication of two 256-bit numbers. Uses
+# MULX/ADOX/ADCX instructions. The 512-bit result is stored at $DST.
+sub mul256 {
+  my ($idxM0,$M0,$idxM1,$M1,$idxDST,$DST,$T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7,$T8,$T9)=@_;
+  my ($ML0,$ML8,$ML16,$ML24)=map("$idxM0+$_($M0)",(0,8,16,24));
+  my ($MR0,$MR8,$MR16,$MR24)=map("$idxM1+$_($M1)",(0,8,16,24));
+  my ($D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7)=map("$idxDST+$_($DST)",(0,8,16,24,32,40,48,56));
+
+  $body=<<___;
+  mov    $ML0, %rdx
+  mulx   $MR0, $T1, $T0   # T0:T1 = A0*B0
+  mov    $T1, $D0         # DST0_final
+  mulx   $MR8, $T2, $T1   # T1:T2 = A0*B1
+  xor    %rax, %rax
+  adox   $T2, $T0
+  mulx   $MR16,$T3, $T2   # T2:T3 = A0*B2
+  adox   $T3, $T1
+  mulx   $MR24,$T4, $T3   # T3:T4 = A0*B3
+  adox   $T4, $T2
+
+  mov    $ML8, %rdx
+  mulx   $MR0, $T4, $T5   # T5:T4 = A1*B0
+  adox   %rax, $T3
+  xor    %rax, %rax
+  mulx   $MR8, $T7, $T6   # T6:T7 = A1*B1
+  adox   $T0, $T4
+  mov    $T4, $D1         # DST1_final
+  adcx   $T7, $T5
+  mulx   $MR16,$T8, $T7   # T7:T8 = A1*B2
+  adcx   $T8, $T6
+  adox   $T1, $T5
+  mulx   $MR24,$T9, $T8   # T8:T9 = A1*B3
+  adcx   $T9, $T7
+  adcx   %rax, $T8
+  adox   $T2, $T6
+
+  mov    $ML16,%rdx
+  mulx   $MR0, $T0, $T1   # T1:T0 = A2*B0
+  adox   $T3, $T7
+  adox   %rax, $T8
+  xor    %rax, %rax
+  mulx   $MR8, $T3, $T2   # T2:T3 = A2*B1
+  adox   $T5, $T0
+  mov    $T0, $D2         # DST2_final
+  adcx   $T3, $T1
+  mulx   $MR16,$T4, $T3   # T3:T4 = A2*B2
+  adcx   $T4, $T2
+  adox   $T6, $T1
+  mulx   $MR24,$T9, $T4   # T3:T4 = A2*B3
+  adcx   $T9, $T3
+  adcx   %rax, $T4
+
+  adox   $T7, $T2
+  adox   $T8, $T3
+  adox   %rax, $T4
+
+  mov    $ML24,%rdx
+  mulx   $MR0,  $T0, $T5   # T5:T0 = A3*B0
+  xor    %rax,  %rax
+  mulx   $MR8,  $T7, $T6   # T6:T7 = A3*B1
+  adcx   $T7,  $T5
+  adox   $T0,  $T1
+  mulx   $MR16, $T8, $T7   # T7:T8 = A3*B2
+  adcx   $T8,  $T6
+  adox   $T5,  $T2
+  mulx   $MR24, $T9, $T8   # T8:T9 = A3*B3
+  adcx   $T9,  $T7
+  adcx   %rax,  $T8
+  adox   $T6,  $T3
+  adox   $T7,  $T4
+  adox   %rax,  $T8
+  mov    $T1,  $D3          # DST3_final
+  mov    $T2,  $D4          # DST4_final
+  mov    $T3,  $D5          # DST5_final
+  mov    $T4,  $D6          # DST6_final
+  mov    $T8,  $D7          # DST7_final
+
+___
+  return $body;
+}
+
+# Performs schoolbook multiplication of 64-bit with 256-bit
+# number.
+sub mul64x256 {
+  my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5)=@_;
   my $body.=<<___;
+    mov   $idxM0($M0), $T5
+
+    xor   $T2, $T2
+    mov   0+$M1, %rax
+    mul   $T5
+    mov   %rax, $T0   # C0
+    mov   %rdx, $T1
+
+    xor   $T3, $T3
+    mov   8+$M1, %rax
+    mul   $T5
+    add   %rax, $T1   # C1
+    adc   %rdx, $T2
+
+    xor   $T4, $T4
+    mov   16+$M1, %rax
+    mul   $T5
+    add   %rax, $T2   # C2
+    adc   %rdx, $T3
+
+    mov   24+$M1, %rax
+    mul   $T5
+    add   %rax, $T3   # C3
+    adc   %rdx, $T4   # C4
+___
+  return $body;
+}
+
+# Performs schoolbook multiplication of 64-bit with 256-bit
+# number. Uses MULX and ADOX instructions.
+sub mulx64x256 {
+  my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5)=@_;
+  my $body.=<<___;
+    xor    %rax, %rax
+    mov    $idxM0($M0), %rdx
+    mulx   0+$M1, $T0, $T1    # T0 <- C0
+    mulx   8+$M1, $T4, $T2
+    mulx   16+$M1, $T5, $T3
+
+    adox   $T4, $T1         # T1 <- C1
+    adox   $T5, $T2         # T2 <- C2
+
+    mulx   24+$M1, $T5, $T4
+    adox   $T5, $T3         # T3 <- C3
+    adox   %rax, $T4         # T4 <- C4
+___
+  return $body;
+}
+
+# Performs schoolbook multiplication of 128-bit with 256-bit
+# number. Destroys RAX and RDX
+sub mul128x256 {
+  my ($idxMA,$MA,$MB,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1)=@_;
+  my ($MA0,$MA8)=map("$idxMA+$_($MA)", (0,8));
+  my $body.=<<___;
+    # A0 x B0
+    mov   $MA0, $T0
+    mov   0+$MB, %rax
+    mul   $T0
+    xor   $C2, $C2
+    mov   %rax, $C0   # c0
+    mov   %rdx, $C1
+
+    # A0 x B1
+    mov   8+$MB, %rax
+    mul   $T0
+    xor   $C3, $C3
+    add   %rax, $C1
+    adc   %rdx, $C2
+
+    # A1 x B0
+    mov   $MA8, $T1
+    mov   0+$MB, %rax
+    mul   $T1
+    add   %rax, $C1
+    adc   %rdx, $C2
+    adc   \$0x0, $C3
+
+    # A0 x B2
+    xor   $C4, $C4
+    mov   16+$MB, %rax
+    mul   $T0
+    add   %rax, $C2
+    adc   %rdx, $C3
+    adc   \$0x0, $C4
+
+    # A1 x B1
+    mov   8+$MB, %rax
+    mul   $T1
+    add   %rax, $C2           # c2
+    adc   %rdx, $C3
+    adc   \$0x0, $C4
+
+    # A0 x B3
+    mov   24+$MB, %rax
+    mul   $T0
+    xor   $C5, $C5
+    add   %rax, $C3
+    adc   %rdx, $C4
+    adc   \$0x0, $C5
+
+    # A1 x B2
+    mov   16+$MB, %rax
+    mul   $T1
+    add   %rax, $C3          # c3
+    adc   %rdx, $C4
+    adc   \$0x0, $C5
+
+    # A1 x B3
+    mov   24+$MB, %rax
+    mul   $T1
+    add   %rax, $C4
+    adc   %rdx, $C5
+
+___
+  return $body;
+}
+
+# Performs schoolbook multiplication of a 128-bit number with a 256-bit
+# number. Uses the MULX, ADOX and ADCX instructions.
+sub mulx128x256 {
+  my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5,$T6)=@_;
+  my ($MUL0,$MUL8)=map("$idxM0+$_($M0)", (0,8));
+  my $body.=<<___;
+    xor     %rax, %rax
     mov    $MUL0, %rdx
-    mulx   0+$M1, %$T0, %$T1       # T0 <- C0_final
-    mulx   8+$M1, %$T4, %$T2
+    mulx   0+$M1, $T0, $T1        # T0 <- C0
+    mulx   8+$M1, $T4, $T2
+    mulx   16+$M1, $T5, $T3
+
+    adox   $T4, $T1               # T1: interm1
+    adox   $T5, $T2               # T2: interm2
+
+    mulx   24+$M1, $T5, $T4
+    adox   $T5, $T3               # T3: interm3
+    adox   %rax, $T4              # T4: interm4
 
     xor    %rax, %rax
-    mulx   16+$M1, %$T5, %$T3
-    adox   %$T4, %$T1
-    adox   %$T5, %$T2
-    mulx   24+$M1, %$T7, %$T4
-    adox   %$T7, %$T3
-    mulx   32+$M1, %$T6, %$T5
-    adox   %$T6, %$T4
-    adox   %rax, %$T5
-
     mov    $MUL8, %rdx
-    mulx   0+$M1, %$T6, %$T7
-    adcx   %$T6, %$T1               # T1 <- C1_final
-    adcx   %$T7, %$T2
-    mulx   8+$M1, %$T8, %$T6
-    adcx   %$T6, %$T3
-    mulx   16+$M1, %$T7, %$T9
-    adcx   %$T9, %$T4
-    mulx   24+$M1, %$T9, %$T6
-    adcx   %$T6, %$T5
-    mulx   32+$M1, %rdx, %$T6
-    adcx   %rax, %$T6
+    mulx   0+$M1, $T5, $T6
+    adcx   $T5, $T1               # T1 <- C1
+    adcx   $T6, $T2
 
-    xor    %rax, %rax
-    adox   %$T8, %$T2
-    adox   %$T7, %$T3
-    adox   %$T9, %$T4
-    adox   %rdx, %$T5
-    adox   %rax, %$T6
+    mulx   8+$M1, $T6, $T5
+    adcx   $T5, $T3
+    adox   $T6, $T2               # T2 <- C2
 
+    mulx   16+$M1, $T6, $T5
+    adcx   $T5, $T4
+    adox   $T6, $T3               # T3 <- C3
+
+    mulx   24+$M1, $T6, $T5
+    adcx   %rax, $T5
+    adox   $T6, $T4               # T4 <- C4
+    adox   %rax, $T5              # T5 <- C5
 ___
   return $body;
 }
@@ -112,87 +377,72 @@
   push %r14
 .cfi_adjust_cfa_offset  8
 .cfi_offset r14, -32
-  push %r15
-.cfi_adjust_cfa_offset  8
-.cfi_offset r15, -40
 
-  xor  %rax, %rax
+  xor   %rax, %rax
 
-  mov  0x0(%rdi),  %r8
-  mov  0x8(%rdi),  %r9
-  mov 0x10(%rdi), %r10
-  mov 0x18(%rdi), %r11
-  mov 0x20(%rdi), %r12
-  mov 0x28(%rdi), %r13
-  mov 0x30(%rdi), %r14
-  mov 0x38(%rdi), %r15
+  mov    0x0(%rdi),  %r8
+  add    0x0(%rsi),  %r8
+  mov    0x8(%rdi),  %r9
+  adc    0x8(%rsi),  %r9
+  mov   0x10(%rdi), %r10
+  adc   0x10(%rsi), %r10
+  mov   0x18(%rdi), %r11
+  adc   0x18(%rsi), %r11
+  mov   0x20(%rdi), %r12
+  adc   0x20(%rsi), %r12
+  mov   0x28(%rdi), %r13
+  adc   0x28(%rsi), %r13
+  mov   0x30(%rdi), %r14
+  adc   0x30(%rsi), %r14
 
-  add  0x0(%rsi),  %r8
-  adc  0x8(%rsi),  %r9
-  adc 0x10(%rsi), %r10
-  adc 0x18(%rsi), %r11
-  adc 0x20(%rsi), %r12
-  adc 0x28(%rsi), %r13
-  adc 0x30(%rsi), %r14
-  adc 0x38(%rsi), %r15
+  mov        .Lp434x2(%rip), %rcx
+  sub   %rcx, %r8
+  mov    0x8+.Lp434x2(%rip), %rcx
+  sbb   %rcx, %r9
+  sbb   %rcx, %r10
+  mov   0x10+.Lp434x2(%rip), %rcx
+  sbb   %rcx, %r11
+  mov   0x18+.Lp434x2(%rip), %rcx
+  sbb   %rcx, %r12
+  mov   0x20+.Lp434x2(%rip), %rcx
+  sbb   %rcx, %r13
+  mov   0x28+.Lp434x2(%rip), %rcx
+  sbb   %rcx, %r14
 
-  mov .Lp503x2(%rip), %rcx;
-  sub %rcx, %r8
-  mov 8+.Lp503x2(%rip), %rcx;
-  sbb %rcx, %r9
-  sbb %rcx, %r10
-  mov 16+.Lp503x2(%rip), %rcx;
-  sbb %rcx, %r11
-  mov 24+.Lp503x2(%rip), %rcx;
-  sbb %rcx, %r12
-  mov 32+.Lp503x2(%rip), %rcx;
-  sbb %rcx, %r13
-  mov 40+.Lp503x2(%rip), %rcx;
-  sbb %rcx, %r14
-  mov 48+.Lp503x2(%rip), %rcx;
-  sbb %rcx, %r15
-  sbb \$0, %rax
+  sbb   \$0, %rax
 
-  mov .Lp503x2(%rip), %rdi
-  and %rax, %rdi
-  mov 8+.Lp503x2(%rip), %rsi
-  and %rax, %rsi
-  mov 16+.Lp503x2(%rip), %rcx
-  and %rax, %rcx
+  mov   .Lp434x2(%rip), %rdi
+  and   %rax, %rdi
+  mov   0x8+.Lp434x2(%rip), %rsi
+  and   %rax, %rsi
+  mov   0x10+.Lp434x2(%rip), %rcx
+  and   %rax, %rcx
 
-  add %rdi, %r8
-  mov %r8, 0x0(%rdx)
-  adc %rsi, %r9
-  mov %r9, 0x8(%rdx)
-  adc %rsi, %r10
-  mov %r10, 0x10(%rdx)
-  adc %rcx, %r11
-  mov %r11, 0x18(%rdx)
+  add   %rdi,  %r8
+  mov   %r8,   0x0(%rdx)
+  adc   %rsi,  %r9
+  mov   %r9,   0x8(%rdx)
+  adc   %rsi, %r10
+  mov   %r10, 0x10(%rdx)
+  adc   %rcx, %r11
+  mov   %r11, 0x18(%rdx)
 
-  setc   %cl
-
-  mov 24+.Lp503x2(%rip), %r8
-  and %rax, %r8
-  mov 32+.Lp503x2(%rip), %r9
-  and %rax, %r9
-  mov 40+.Lp503x2(%rip), %r10
-  and %rax, %r10
-  mov 48+.Lp503x2(%rip), %r11
-  and %rax, %r11
-
+  setc  %cl
+  mov   0x18+.Lp434x2(%rip),  %r8
+  and   %rax,  %r8
+  mov   0x20+.Lp434x2(%rip),  %r9
+  and   %rax,  %r9
+  mov   0x28+.Lp434x2(%rip), %r10
+  and   %rax, %r10
   bt    \$0, %rcx
 
-  adc  %r8, %r12
-  mov %r12, 0x20(%rdx)
-  adc  %r9, %r13
-  mov %r13, 0x28(%rdx)
-  adc %r10, %r14
-  mov %r14, 0x30(%rdx)
-  adc %r11, %r15
-  mov %r15, 0x38(%rdx)
+  adc   %r8, %r12
+  mov   %r12, 0x20(%rdx)
+  adc   %r9, %r13
+  mov   %r13, 0x28(%rdx)
+  adc   %r10, %r14
+  mov  %r14, 0x30(%rdx)
 
-  pop %r15
-.cfi_adjust_cfa_offset  -8
   pop %r14
 .cfi_adjust_cfa_offset  -8
   pop %r13
@@ -203,8 +453,6 @@
 .cfi_endproc
 ___
 
-
-
 # Loads data to XMM0 and XMM1 and
 # conditionaly swaps depending on XMM3
 sub cswap_block16() {
@@ -226,15 +474,11 @@
 # Conditionally swaps bits in x and y in constant time.
 # mask indicates bits to be swapped (set bits are swapped)
 # Operation: [rdi] <-> [rsi] if rdx==1
-sub cswap {
-  # P[0].X with Q[0].X
-  foreach ( 0.. 3){$BLOCKS.=eval "&cswap_block16($_)";}
-  # P[0].Z with Q[0].Z
-  foreach ( 4.. 7){$BLOCKS.=eval "&cswap_block16($_)";}
-  # P[1].X with Q[1].X
-  foreach ( 8..11){$BLOCKS.=eval "&cswap_block16($_)";}
-  # P[1].Z with Q[1].Z
-  foreach (12..15){$BLOCKS.=eval "&cswap_block16($_)";}
+sub sike_cswap {
+  # P[0] with Q[0]
+  foreach ( 0.. 6){$BLOCKS.=eval "&cswap_block16($_)";}
+  # P[1] with Q[1]
+  foreach ( 7..13){$BLOCKS.=eval "&cswap_block16($_)";}
 
   my $body =<<___;
 .globl  ${PREFIX}_cswap_asm
@@ -254,7 +498,8 @@
 ___
   ($body)
 }
-$code.=&cswap();
+$code.=&sike_cswap();
+
 
 # Field subtraction
 # Operation: c [rdx] = a [rdi] - b [rsi]
@@ -272,71 +517,58 @@
   push   %r14
 .cfi_adjust_cfa_offset  8
 .cfi_offset r14, -32
-  push   %r15
-.cfi_adjust_cfa_offset  8
-.cfi_offset r15, -40
 
   xor %rax, %rax
 
-  mov  0x0(%rdi), %r8
-  mov  0x8(%rdi), %r9
-  mov 0x10(%rdi), %r10
-  mov 0x18(%rdi), %r11
-  mov 0x20(%rdi), %r12
-  mov 0x28(%rdi), %r13
-  mov 0x30(%rdi), %r14
-  mov 0x38(%rdi), %r15
+  mov    0x0(%rdi),  %r8
+  sub    0x0(%rsi),  %r8
+  mov    0x8(%rdi),  %r9
+  sbb    0x8(%rsi),  %r9
+  mov   0x10(%rdi), %r10
+  sbb   0x10(%rsi), %r10
+  mov   0x18(%rdi), %r11
+  sbb   0x18(%rsi), %r11
+  mov   0x20(%rdi), %r12
+  sbb   0x20(%rsi), %r12
+  mov   0x28(%rdi), %r13
+  sbb   0x28(%rsi), %r13
+  mov   0x30(%rdi), %r14
+  sbb   0x30(%rsi), %r14
 
-  sub  0x0(%rsi), %r8
-  sbb  0x8(%rsi), %r9
-  sbb 0x10(%rsi), %r10
-  sbb 0x18(%rsi), %r11
-  sbb 0x20(%rsi), %r12
-  sbb 0x28(%rsi), %r13
-  sbb 0x30(%rsi), %r14
-  sbb 0x38(%rsi), %r15
-  sbb \$0x0, %rax
+  sbb   \$0x0, %rax
 
-  mov .Lp503x2(%rip), %rdi
-  and %rax, %rdi
-  mov 0x8+.Lp503x2(%rip), %rsi
-  and %rax, %rsi
-  mov 0x10+.Lp503x2(%rip), %rcx
-  and %rax, %rcx
+  mov   .Lp434x2(%rip), %rdi
+  and   %rax, %rdi
+  mov   0x08+.Lp434x2(%rip), %rsi
+  and   %rax, %rsi
+  mov   0x10+.Lp434x2(%rip), %rcx
+  and   %rax, %rcx
 
-  add %rdi,        %r8
-  adc %rsi,        %r9
-  adc %rsi,       %r10
-  adc %rcx,       %r11
-  mov %r8,   0x0(%rdx)
-  mov %r9,   0x8(%rdx)
-  mov %r10, 0x10(%rdx)
-  mov %r11, 0x18(%rdx)
+  add   %rdi,  %r8
+  mov   %r8,   0x0(%rdx)
+  adc   %rsi,  %r9
+  mov   %r9,   0x8(%rdx)
+  adc   %rsi, %r10
+  mov   %r10, 0x10(%rdx)
+  adc   %rcx, %r11
+  mov   %r11, 0x18(%rdx)
 
-  setc %cl
+  setc  %cl
+  mov   0x18+.Lp434x2(%rip),  %r8
+  and   %rax,  %r8
+  mov   0x20+.Lp434x2(%rip),  %r9
+  and   %rax,  %r9
+  mov   0x28+.Lp434x2(%rip), %r10
+  and   %rax, %r10
+  bt    \$0x0, %rcx
 
-  mov 0x18+.Lp503x2(%rip),  %r8
-  and %rax,  %r8
-  mov 0x20+.Lp503x2(%rip),  %r9
-  and %rax,  %r9
-  mov 0x28+.Lp503x2(%rip), %r10
-  and %rax, %r10
-  mov 0x30+.Lp503x2(%rip), %r11
-  and %rax, %r11
+  adc   %r8, %r12
+  adc   %r9, %r13
+  adc   %r10, %r14
+  mov   %r12, 0x20(%rdx)
+  mov   %r13, 0x28(%rdx)
+  mov   %r14, 0x30(%rdx)
 
-  bt \$0x0, %rcx
-
-  adc %r8, %r12
-  adc %r9, %r13
-  adc %r10, %r14
-  adc %r11, %r15
-  mov %r12, 0x20(%rdx)
-  mov %r13, 0x28(%rdx)
-  mov %r14, 0x30(%rdx)
-  mov %r15, 0x38(%rdx)
-
-  pop %r15
-.cfi_adjust_cfa_offset  -8
   pop %r14
 .cfi_adjust_cfa_offset  -8
   pop %r13
@@ -347,43 +579,40 @@
 .cfi_endproc
 ___
 
-#  503-bit multiprecision addition
+#  434-bit multiprecision addition
 #  Operation: c [rdx] = a [rdi] + b [rsi]
 $code.=<<___;
 .globl  ${PREFIX}_mpadd_asm
 .type   ${PREFIX}_mpadd_asm,\@function,3
 ${PREFIX}_mpadd_asm:
 .cfi_startproc
-  mov  0x0(%rdi), %r8
-  mov  0x8(%rdi), %r9
-  mov 0x10(%rdi), %r10
-  mov 0x18(%rdi), %r11
-  add  0x0(%rsi), %r8
-  adc  0x8(%rsi), %r9
-  adc 0x10(%rsi), %r10
-  adc 0x18(%rsi), %r11
-  mov %r8,   0x0(%rdx)
-  mov %r9,   0x8(%rdx)
-  mov %r10, 0x10(%rdx)
-  mov %r11, 0x18(%rdx)
+  mov    0x0(%rdi), %r8;
+  mov    0x8(%rdi), %r9
+  mov   0x10(%rdi), %r10
+  mov   0x18(%rdi), %r11
+  mov   0x20(%rdi), %rcx
+  add    0x0(%rsi), %r8
+  adc    0x8(%rsi), %r9
+  adc   0x10(%rsi), %r10
+  adc   0x18(%rsi), %r11
+  adc   0x20(%rsi), %rcx
+  mov   %r8,   0x0(%rdx)
+  mov   %r9,   0x8(%rdx)
+  mov   %r10, 0x10(%rdx)
+  mov   %r11, 0x18(%rdx)
+  mov   %rcx, 0x20(%rdx)
 
-  mov 0x20(%rdi), %r8
-  mov 0x28(%rdi), %r9
-  mov 0x30(%rdi), %r10
-  mov 0x38(%rdi), %r11
-  adc 0x20(%rsi), %r8
-  adc 0x28(%rsi), %r9
-  adc 0x30(%rsi), %r10
-  adc 0x38(%rsi), %r11
-  mov %r8,  0x20(%rdx)
-  mov %r9,  0x28(%rdx)
-  mov %r10, 0x30(%rdx)
-  mov %r11, 0x38(%rdx)
+  mov   0x28(%rdi), %r8
+  mov   0x30(%rdi), %r9
+  adc   0x28(%rsi), %r8
+  adc   0x30(%rsi), %r9
+  mov   %r8, 0x28(%rdx)
+  mov   %r9, 0x30(%rdx)
   ret
 .cfi_endproc
 ___
 
-#  2x503-bit multiprecision subtraction
+#  2x434-bit multiprecision subtraction
 #  Operation: c [rdx] = a [rdi] - b [rsi].
 #  Returns borrow mask
 $code.=<<___;
@@ -391,65 +620,59 @@
 .type   ${PREFIX}_mpsubx2_asm,\@function,3
 ${PREFIX}_mpsubx2_asm:
 .cfi_startproc
-  xor %rax, %rax
+  xor   %rax, %rax
 
-  mov  0x0(%rdi), %r8
-  mov  0x8(%rdi), %r9
-  mov 0x10(%rdi), %r10
-  mov 0x18(%rdi), %r11
-  mov 0x20(%rdi), %rcx
-  sub  0x0(%rsi), %r8
-  sbb  0x8(%rsi), %r9
-  sbb 0x10(%rsi), %r10
-  sbb 0x18(%rsi), %r11
-  sbb 0x20(%rsi), %rcx
-  mov %r8,   0x0(%rdx)
-  mov %r9,   0x8(%rdx)
-  mov %r10, 0x10(%rdx)
-  mov %r11, 0x18(%rdx)
-  mov %rcx, 0x20(%rdx)
+  mov    0x0(%rdi), %r8
+  mov    0x8(%rdi), %r9
+  mov   0x10(%rdi), %r10
+  mov   0x18(%rdi), %r11
+  mov   0x20(%rdi), %rcx
+  sub    0x0(%rsi), %r8
+  sbb    0x8(%rsi), %r9
+  sbb   0x10(%rsi), %r10
+  sbb   0x18(%rsi), %r11
+  sbb   0x20(%rsi), %rcx
+  mov   %r8,   0x0(%rdx)
+  mov   %r9,   0x8(%rdx)
+  mov   %r10, 0x10(%rdx)
+  mov   %r11, 0x18(%rdx)
+  mov   %rcx, 0x20(%rdx)
 
-  mov 0x28(%rdi), %r8
-  mov 0x30(%rdi), %r9
-  mov 0x38(%rdi), %r10
-  mov 0x40(%rdi), %r11
-  mov 0x48(%rdi), %rcx
-  sbb 0x28(%rsi), %r8
-  sbb 0x30(%rsi), %r9
-  sbb 0x38(%rsi), %r10
-  sbb 0x40(%rsi), %r11
-  sbb 0x48(%rsi), %rcx
-  mov %r8,  0x28(%rdx)
-  mov %r9,  0x30(%rdx)
-  mov %r10, 0x38(%rdx)
-  mov %r11, 0x40(%rdx)
-  mov %rcx, 0x48(%rdx)
+  mov   0x28(%rdi), %r8
+  mov   0x30(%rdi), %r9
+  mov   0x38(%rdi), %r10
+  mov   0x40(%rdi), %r11
+  mov   0x48(%rdi), %rcx
+  sbb   0x28(%rsi), %r8
+  sbb   0x30(%rsi), %r9
+  sbb   0x38(%rsi), %r10
+  sbb   0x40(%rsi), %r11
+  sbb   0x48(%rsi), %rcx
+  mov   %r8,  0x28(%rdx)
+  mov   %r9,  0x30(%rdx)
+  mov   %r10, 0x38(%rdx)
+  mov   %r11, 0x40(%rdx)
+  mov   %rcx, 0x48(%rdx)
 
-  mov 0x50(%rdi), %r8
-  mov 0x58(%rdi), %r9
-  mov 0x60(%rdi), %r10
-  mov 0x68(%rdi), %r11
-  mov 0x70(%rdi), %rcx
-  sbb 0x50(%rsi), %r8
-  sbb 0x58(%rsi), %r9
-  sbb 0x60(%rsi), %r10
-  sbb 0x68(%rsi), %r11
-  sbb 0x70(%rsi), %rcx
-  mov %r8,  0x50(%rdx)
-  mov %r9,  0x58(%rdx)
-  mov %r10, 0x60(%rdx)
-  mov %r11, 0x68(%rdx)
-  mov %rcx, 0x70(%rdx)
-
-  mov 0x78(%rdi), %r8
-  sbb 0x78(%rsi), %r8
-  sbb \$0x0, %rax
-  mov %r8, 0x78(%rdx)
+  mov   0x50(%rdi), %r8
+  mov   0x58(%rdi), %r9
+  mov   0x60(%rdi), %r10
+  mov   0x68(%rdi), %r11
+  sbb   0x50(%rsi), %r8
+  sbb   0x58(%rsi), %r9
+  sbb   0x60(%rsi), %r10
+  sbb   0x68(%rsi), %r11
+  sbb   \$0x0, %rax
+  mov   %r8,  0x50(%rdx)
+  mov   %r9,  0x58(%rdx)
+  mov   %r10, 0x60(%rdx)
+  mov   %r11, 0x68(%rdx)
   ret
 .cfi_endproc
 ___
 
-#  Double 2x503-bit multiprecision subtraction
+
+#  Double 2x434-bit multiprecision subtraction
 #  Operation: c [rdx] = c [rdx] - a [rdi] - b [rsi]
 $code.=<<___;
 .globl  ${PREFIX}_mpdblsubx2_asm
@@ -462,87 +685,81 @@
   push   %r13
 .cfi_adjust_cfa_offset 8
 .cfi_offset r13, -24
-  push   %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset r14, -32
 
-  xor %rax, %rax
+  xor   %rax, %rax
 
-  mov  0x0(%rdx), %r8
-  mov  0x8(%rdx), %r9
-  mov 0x10(%rdx), %r10
-  mov 0x18(%rdx), %r11
-  mov 0x20(%rdx), %r12
-  mov 0x28(%rdx), %r13
-  mov 0x30(%rdx), %r14
-  mov 0x38(%rdx), %rcx
-  sub  0x0(%rdi), %r8
-  sbb  0x8(%rdi), %r9
-  sbb 0x10(%rdi), %r10
-  sbb 0x18(%rdi), %r11
-  sbb 0x20(%rdi), %r12
-  sbb 0x28(%rdi), %r13
-  sbb 0x30(%rdi), %r14
-  sbb 0x38(%rdi), %rcx
-  adc \$0x0, %rax
+  # ci:low = c:low - a:low
+  mov    0x0(%rdx), %r8
+  mov    0x8(%rdx), %r9
+  mov   0x10(%rdx), %r10
+  mov   0x18(%rdx), %r11
+  mov   0x20(%rdx), %r12
+  mov   0x28(%rdx), %r13
+  mov   0x30(%rdx), %rcx
+  sub    0x0(%rdi), %r8
+  sbb    0x8(%rdi), %r9
+  sbb   0x10(%rdi), %r10
+  sbb   0x18(%rdi), %r11
+  sbb   0x20(%rdi), %r12
+  sbb   0x28(%rdi), %r13
+  sbb   0x30(%rdi), %rcx
+  adc   \$0x0, %rax
 
-  sub  0x0(%rsi), %r8
-  sbb  0x8(%rsi), %r9
-  sbb 0x10(%rsi), %r10
-  sbb 0x18(%rsi), %r11
-  sbb 0x20(%rsi), %r12
-  sbb 0x28(%rsi), %r13
-  sbb 0x30(%rsi), %r14
-  sbb 0x38(%rsi), %rcx
-  adc \$0x0, %rax
+  # c:low = ci:low - b:low
+  sub    0x0(%rsi), %r8
+  sbb    0x8(%rsi), %r9
+  sbb   0x10(%rsi), %r10
+  sbb   0x18(%rsi), %r11
+  sbb   0x20(%rsi), %r12
+  sbb   0x28(%rsi), %r13
+  sbb   0x30(%rsi), %rcx
+  adc   \$0x0, %rax
 
-  mov %r8,   0x0(%rdx)
-  mov %r9,   0x8(%rdx)
-  mov %r10, 0x10(%rdx)
-  mov %r11, 0x18(%rdx)
-  mov %r12, 0x20(%rdx)
-  mov %r13, 0x28(%rdx)
-  mov %r14, 0x30(%rdx)
-  mov %rcx, 0x38(%rdx)
+  # store c:low
+  mov   %r8,   0x0(%rdx)
+  mov   %r9,   0x8(%rdx)
+  mov   %r10, 0x10(%rdx)
+  mov   %r11, 0x18(%rdx)
+  mov   %r12, 0x20(%rdx)
+  mov   %r13, 0x28(%rdx)
+  mov   %rcx, 0x30(%rdx)
 
-  mov 0x40(%rdx), %r8
-  mov 0x48(%rdx), %r9
-  mov 0x50(%rdx), %r10
-  mov 0x58(%rdx), %r11
-  mov 0x60(%rdx), %r12
-  mov 0x68(%rdx), %r13
-  mov 0x70(%rdx), %r14
-  mov 0x78(%rdx), %rcx
+  # ci:high = c:high - a:high
+  mov   0x38(%rdx), %r8
+  mov   0x40(%rdx), %r9
+  mov   0x48(%rdx), %r10
+  mov   0x50(%rdx), %r11
+  mov   0x58(%rdx), %r12
+  mov   0x60(%rdx), %r13
+  mov   0x68(%rdx), %rcx
 
-  sub %rax, %r8
-  sbb 0x40(%rdi), %r8
-  sbb 0x48(%rdi), %r9
-  sbb 0x50(%rdi), %r10
-  sbb 0x58(%rdi), %r11
-  sbb 0x60(%rdi), %r12
-  sbb 0x68(%rdi), %r13
-  sbb 0x70(%rdi), %r14
-  sbb 0x78(%rdi), %rcx
-  sub 0x40(%rsi), %r8
-  sbb 0x48(%rsi), %r9
-  sbb 0x50(%rsi), %r10
-  sbb 0x58(%rsi), %r11
-  sbb 0x60(%rsi), %r12
-  sbb 0x68(%rsi), %r13
-  sbb 0x70(%rsi), %r14
-  sbb 0x78(%rsi), %rcx
+  sub   %rax, %r8
+  sbb   0x38(%rdi), %r8
+  sbb   0x40(%rdi), %r9
+  sbb   0x48(%rdi), %r10
+  sbb   0x50(%rdi), %r11
+  sbb   0x58(%rdi), %r12
+  sbb   0x60(%rdi), %r13
+  sbb   0x68(%rdi), %rcx
 
-  mov %r8,  0x40(%rdx)
-  mov %r9,  0x48(%rdx)
-  mov %r10, 0x50(%rdx)
-  mov %r11, 0x58(%rdx)
-  mov %r12, 0x60(%rdx)
-  mov %r13, 0x68(%rdx)
-  mov %r14, 0x70(%rdx)
-  mov %rcx, 0x78(%rdx)
+  # c:high = ci:high - b:high
+  sub   0x38(%rsi), %r8
+  sbb   0x40(%rsi), %r9
+  sbb   0x48(%rsi), %r10
+  sbb   0x50(%rsi), %r11
+  sbb   0x58(%rsi), %r12
+  sbb   0x60(%rsi), %r13
+  sbb   0x68(%rsi), %rcx
 
-  pop %r14
-.cfi_adjust_cfa_offset -8
+  # store c:high
+  mov   %r8,  0x38(%rdx)
+  mov   %r9,  0x40(%rdx)
+  mov   %r10, 0x48(%rdx)
+  mov   %r11, 0x50(%rdx)
+  mov   %r12, 0x58(%rdx)
+  mov   %r13, 0x60(%rdx)
+  mov   %rcx, 0x68(%rdx)
+
   pop %r13
 .cfi_adjust_cfa_offset -8
   pop %r12
@@ -552,117 +769,212 @@
 
 ___
 
-# Performs schoolbook multiplication of 2 256-bit numbers. Uses
-# MULX instruction. Result is stored in 256 bits pointed by $DST.
-sub mul256_school {
-  my ($idxM0,$M0,$idxM1,$M1,$idxDST,$DST,$T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7,$T8,$T9)=@_;
-  my ($ML0,$ML8,$ML16,$ML24)=map("$idxM0+$_(%$M0)",(0,8,16,24));
-  my ($MR0,$MR8,$MR16,$MR24)=map("$idxM1+$_(%$M1)",(0,8,16,24));
-  my ($D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7)=map("$idxDST+$_(%$DST)",(0,8,16,24,32,40,48,56));
+sub redc_common {  # Returns the REDC assembly body; only the multiplication snippets differ per caller.
+  my ($mul01, $mul23, $mul45, $mul67)=@_;  # snippets interpolated, in this order, before each accumulation step
+  my $body=<<___;
+    $mul01
+    xor   %rcx, %rcx              # rcx <- 0 (also clears CF); receives a[9] plus carry
+    add   0x18(%rdi), %r8         # r8:r13 += a[3-8]
+    adc   0x20(%rdi), %r9
+    adc   0x28(%rdi), %r10
+    adc   0x30(%rdi), %r11
+    adc   0x38(%rdi), %r12
+    adc   0x40(%rdi), %r13
+    adc   0x48(%rdi), %rcx
+    mov   %r8, 0x18(%rdi)         # write the sums back to a[3-9]; mov leaves CF untouched
+    mov   %r9, 0x20(%rdi)
+    mov   %r10, 0x28(%rdi)
+    mov   %r11, 0x30(%rdi)
+    mov   %r12, 0x38(%rdi)
+    mov   %r13, 0x40(%rdi)
+    mov   %rcx, 0x48(%rdi)
+    mov   0x50(%rdi), %r8         # load a[10-13] so the pending carry can ripple through
+    mov   0x58(%rdi), %r9
+    mov   0x60(%rdi), %r10
+    mov   0x68(%rdi), %r11
+    adc   \$0x0, %r8
+    adc   \$0x0, %r9
+    adc   \$0x0, %r10
+    adc   \$0x0, %r11
+    mov   %r8, 0x50(%rdi)         # store the carry-adjusted a[10-13]
+    mov   %r9, 0x58(%rdi)
+    mov   %r10, 0x60(%rdi)
+    mov   %r11, 0x68(%rdi)
 
-  $body=<<___;
-  mov    $ML0, %rdx
-  mulx   $MR0, %$T1, %$T0   # T0:T1 = A0*B0
-  mov    %$T1, $D0          # DST0_final
-  mulx   $MR8, %$T2, %$T1   # T1:T2 = A0*B1
-  xor    %rax, %rax
-  adox   %$T2, %$T0
-  mulx   $MR16,%$T3, %$T2   # T2:T3 = A0*B2
-  adox   %$T3, %$T1
-  mulx   $MR24,%$T4, %$T3   # T3:T4 = A0*B3
-  adox   %$T4, %$T2
+    $mul23
+    xor   %rcx, %rcx
+    add   0x28(%rdi), %r8
+    adc   0x30(%rdi), %r9
+    adc   0x38(%rdi), %r10
+    adc   0x40(%rdi), %r11
+    adc   0x48(%rdi), %r12
+    adc   0x50(%rdi), %r13
+    adc   0x58(%rdi), %rcx
+    mov   %r8, 0x28(%rdi)
+    mov   %r9, 0x30(%rdi)
+    mov   %r10, 0x38(%rdi)
+    mov   %r11, 0x40(%rdi)
+    mov   %r12, 0x48(%rdi)
+    mov   %r13, 0x50(%rdi)
+    mov   %rcx, 0x58(%rdi)
+    mov   0x60(%rdi), %r8
+    mov   0x68(%rdi), %r9
+    adc   \$0x0, %r8
+    adc   \$0x0, %r9
+    mov   %r8, 0x60(%rdi)
+    mov   %r9, 0x68(%rdi)
 
-  mov    $ML8, %rdx
-  mulx   $MR0, %$T4, %$T5   # T5:T4 = A1*B0
-  adox   %rax, %$T3
-  xor    %rax, %rax
-  mulx   $MR8, %$T7, %$T6   # T6:T7 = A1*B1
-  adox   %$T0, %$T4
-  mov    %$T4, $D1          # DST1_final
-  adcx   %$T7, %$T5
-  mulx   $MR16,%$T8, %$T7   # T7:T8 = A1*B2
-  adcx   %$T8, %$T6
-  adox   %$T1, %$T5
-  mulx   $MR24,%$T9, %$T8   # T8:T9 = A1*B3
-  adcx   %$T9, %$T7
-  adcx   %rax, %$T8
-  adox   %$T2, %$T6
+    $mul45
+    xor   %rcx, %rcx
+    add   0x38(%rdi), %r8
+    adc   0x40(%rdi), %r9
+    adc   0x48(%rdi), %r10
+    adc   0x50(%rdi), %r11
+    adc   0x58(%rdi), %r12
+    adc   0x60(%rdi), %r13
+    adc   0x68(%rdi), %rcx
+    mov   %r8,   0x0(%rsi)    # C0
+    mov   %r9,   0x8(%rsi)    # C1
+    mov   %r10, 0x48(%rdi)
+    mov   %r11, 0x50(%rdi)
+    mov   %r12, 0x58(%rdi)
+    mov   %r13, 0x60(%rdi)
+    mov   %rcx, 0x68(%rdi)
 
-  mov    $ML16,%rdx
-  mulx   $MR0, %$T0, %$T1   # T1:T0 = A2*B0
-  adox   %$T3, %$T7
-  adox   %rax, %$T8
-  xor    %rax, %rax
-  mulx   $MR8, %$T3, %$T2   # T2:T3 = A2*B1
-  adox   %$T5, %$T0
-  mov    %$T0, $D2          # DST2_final
-  adcx   %$T3, %$T1
-  mulx   $MR16,%$T4, %$T3   # T3:T4 = A2*B2
-  adcx   %$T4, %$T2
-  adox   %$T6, %$T1
-  mulx   $MR24,%$T9, %$T4   # T3:T4 = A2*B3
-  adcx   %$T9, %$T3
-
-  adcx   %rax, %$T4
-  adox   %$T7, %$T2
-  adox   %$T8, %$T3
-  adox   %rax, %$T4
-
-  mov    $ML24, %rdx
-  mulx   $MR0,  %$T0, %$T5   # T5:T0 = A3*B0
-  xor    %rax,  %rax
-  mulx   $MR8,  %$T7, %$T6   # T6:T7 = A3*B1
-  adcx   %$T7,  %$T5
-  adox   %$T0,  %$T1
-  mulx   $MR16, %$T8, %$T7   # T7:T8 = A3*B2
-  adcx   %$T8,  %$T6
-  adox   %$T5,  %$T2
-  mulx   $MR24, %$T9, %$T8   # T8:T9 = A3*B3
-  adcx   %$T9,  %$T7
-  adcx   %rax,  %$T8
-  adox   %$T6,  %$T3
-  adox   %$T7,  %$T4
-  adox   %rax,  %$T8
-  mov    %$T1,  $D3          # DST3_final
-  mov    %$T2,  $D4          # DST4_final
-  mov    %$T3,  $D5          # DST5_final
-  mov    %$T4,  $D6          # DST6_final
-  mov    %$T8,  $D7          # DST7_final
-
+    $mul67
+    add   0x48(%rdi), %r8
+    adc   0x50(%rdi), %r9
+    adc   0x58(%rdi), %r10
+    adc   0x60(%rdi), %r11
+    adc   0x68(%rdi), %r12
+    mov   %r8,  0x10(%rsi)    # C2
+    mov   %r9,  0x18(%rsi)    # C3
+    mov   %r10, 0x20(%rsi)    # C4
+    mov   %r11, 0x28(%rsi)    # C5
+    mov   %r12, 0x30(%rsi)    # C6
 ___
   return $body;
 }
 
-# 503-bit multiplication using Karatsuba (one level),
-# schoolbook (one level).
-sub mul_mulx {
-  # [rcx+64] <- (AH+AL) x (BH+BL)
-  my $mul256_low=&mul256_school(0,"rsp",32,"rsp",64,"rcx",map("r$_",(8..15)),"rbx","rbp");
+# Optimized Montgomery reduction for CPUs, based on method described
+# in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015.
+# Operation: c [rsi] = a [rdi]
+# NOTE: a=c is not allowed
+sub sike_rdc {
+  my $jump_redc_bdw=$bmi2_adx ? &alt_impl(".Lrdc_bdw") : "";
+  # a[0-1] x .Lp434p1 --> result: r8:r13
+  my $mulx1=&mulx128x256( 0,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)),"%rcx");
+  # a[2-3] x .Lp434p1 --> result: r8:r13
+  my $mulx2=&mulx128x256(16,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)),"%rcx");
+  # a[4-5] x .Lp434p1 --> result: r8:r13
+  my $mulx3=&mulx128x256(32,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)),"%rcx");
+  # a[6] x .Lp434p1 --> result: r8:r12 (only r8-r12 are consumed by redc_common)
+  my $mulx4=&mulx64x256( 48,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)));
+
+  # a[0-1] x .Lp434p1 --> result: r8:r13
+  my $mul1=&mul128x256( 0,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..14)),"%rcx");
+  # a[2-3] x .Lp434p1 --> result: r8:r13
+  my $mul2=&mul128x256(16,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..14)),"%rcx");
+  # a[4-5] x .Lp434p1 --> result: r8:r13
+  my $mul3=&mul128x256(32,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..14)),"%rcx");
+  # a[6] x .Lp434p1 --> result: r8:r12 (only r8-r12 are consumed by redc_common)
+  my $mul4=&mul64x256( 48,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)));
+
+  my $redc_mul=&redc_common($mul1, $mul2, $mul3, $mul4);
+  my $redc_bdw=$bmi2_adx ? &redc_common($mulx1, $mulx2, $mulx3, $mulx4) : "";
+
+  # REDC for Broadwell CPUs (body is the empty string when $bmi2_adx is unset)
+  my $code=<<___;
+    .Lrdc_bdw:
+    .cfi_startproc
+      # sike_fprdc has already pushed r12--15 by this point.
+    .cfi_adjust_cfa_offset 32
+    .cfi_offset r12, -16
+    .cfi_offset r13, -24
+    .cfi_offset r14, -32
+    .cfi_offset r15, -40
+
+    $redc_bdw
+
+      pop %r15
+    .cfi_adjust_cfa_offset -8
+    .cfi_same_value r15
+      pop %r14
+    .cfi_adjust_cfa_offset -8
+    .cfi_same_value r14
+      pop %r13
+    .cfi_adjust_cfa_offset -8
+    .cfi_same_value r13
+      pop %r12
+    .cfi_adjust_cfa_offset -8
+    .cfi_same_value r12
+      ret
+    .cfi_endproc
+___
+
+  # REDC for CPUs older than Broadwell
+  $code.=<<___;
+    .globl  ${PREFIX}_fprdc
+    .type   ${PREFIX}_fprdc,\@function,3
+    ${PREFIX}_fprdc:
+    .cfi_startproc
+      push %r12
+    .cfi_adjust_cfa_offset  8
+    .cfi_offset r12, -16
+      push %r13
+    .cfi_adjust_cfa_offset  8
+    .cfi_offset r13, -24
+      push %r14
+    .cfi_adjust_cfa_offset  8
+    .cfi_offset r14, -32
+      push %r15
+    .cfi_adjust_cfa_offset  8
+    .cfi_offset r15, -40
+
+      # Jump to optimized implementation if
+      # CPU supports ADCX/ADOX/MULX
+      $jump_redc_bdw
+      # Otherwise use generic implementation
+      $redc_mul
+
+      pop %r15
+    .cfi_adjust_cfa_offset -8
+      pop %r14
+    .cfi_adjust_cfa_offset -8
+      pop %r13
+    .cfi_adjust_cfa_offset -8
+      pop %r12
+    .cfi_adjust_cfa_offset -8
+      ret
+    .cfi_endproc
+___
+  return $code;
+}
+$code.=&sike_rdc();
+
+# 434-bit multiplication using Karatsuba (one level),
+# schoolbook (one level). Uses MULX/ADOX/ADCX instructions
+# available on Broadwell micro-architectures and newer.
+sub mul_bdw {
+  # [rsp] <- (AH+AL) x (BH+BL)
+  my $mul256_low=&mul256(0,"%rsp",32,"%rsp",0,"%rsp",map("%r$_",(8..15)),"%rbx","%rbp");
   # [rcx] <- AL x BL
-  my $mul256_albl=&mul256_school(0,"rdi",0,"rsi",0,"rcx",map("r$_",(8..15)),"rbx","rbp");
-  # [rsp] <- AH x BH
-  my $mul256_ahbh=&mul256_school(32,"rdi",32,"rsi",0,"rsp",map("r$_",(8..15)),"rbx","rbp");
+  my $mul256_albl=&mul256(0,"%rdi",0,"%rsi",0,"%rcx",map("%r$_",(8..15)),"%rbx","%rbp");
+  # [rcx+64] <- AH x BH
+  my $mul192_ahbh=&mul192(32,"%rdi",32,"%rsi",64,"%rcx",map("%r$_",(8..14)));
 
   $body=<<___;
-  .Lmul_mulx:
-  .cfi_startproc
-    # sike_mpmul has already pushed r12--15 by this point.
-  .cfi_adjust_cfa_offset 32
-  .cfi_offset r12, -16
-  .cfi_offset r13, -24
-  .cfi_offset r14, -32
-  .cfi_offset r15, -40
 
-    mov %rdx, %rcx
+    mov   %rdx, %rcx
+    xor   %rax, %rax
 
     # r8-r11 <- AH + AL, rax <- mask
-    xor %rax, %rax
-    mov (%rdi), %r8
-    mov 0x8(%rdi), %r9
-    mov 0x10(%rdi), %r10
-    mov 0x18(%rdi), %r11
-    push %rbx
+    mov    0x0(%rdi), %r8
+    mov    0x8(%rdi), %r9
+    mov   0x10(%rdi), %r10
+    mov   0x18(%rdi), %r11
 
+    push %rbx
   .cfi_adjust_cfa_offset 8
   .cfi_offset rbx, -48
     push %rbp
@@ -670,131 +982,123 @@
   .cfi_adjust_cfa_offset 8
     sub \$96, %rsp
   .cfi_adjust_cfa_offset 96
-    add 0x20(%rdi), %r8
-    adc 0x28(%rdi), %r9
-    adc 0x30(%rdi), %r10
-    adc 0x38(%rdi), %r11
-    sbb \$0x0, %rax
-    mov %r8, (%rsp)
-    mov %r9, 0x8(%rsp)
-    mov %r10, 0x10(%rsp)
-    mov %r11, 0x18(%rsp)
+
+    add   0x20(%rdi), %r8
+    adc   0x28(%rdi), %r9
+    adc   0x30(%rdi), %r10
+    adc   \$0x0, %r11
+    sbb   \$0x0, %rax
+    mov   %r8,   0x0(%rsp)
+    mov   %r9,   0x8(%rsp)
+    mov   %r10, 0x10(%rsp)
+    mov   %r11, 0x18(%rsp)
 
     # r12-r15 <- BH + BL, rbx <- mask
-    xor %rbx, %rbx
-    mov (%rsi), %r12
-    mov 0x8(%rsi), %r13
-    mov 0x10(%rsi), %r14
-    mov 0x18(%rsi), %r15
-    add 0x20(%rsi), %r12
-    adc 0x28(%rsi), %r13
-    adc 0x30(%rsi), %r14
-    adc 0x38(%rsi), %r15
-    sbb \$0x0, %rbx
-    mov %r12, 0x20(%rsp)
-    mov %r13, 0x28(%rsp)
-    mov %r14, 0x30(%rsp)
-    mov %r15, 0x38(%rsp)
+    xor   %rbx, %rbx
+    mov    0x0(%rsi), %r12
+    mov    0x8(%rsi), %r13
+    mov   0x10(%rsi), %r14
+    mov   0x18(%rsi), %r15
+    add   0x20(%rsi), %r12
+    adc   0x28(%rsi), %r13
+    adc   0x30(%rsi), %r14
+    adc   \$0x0, %r15
+    sbb   \$0x0, %rbx
+    mov   %r12, 0x20(%rsp)
+    mov   %r13, 0x28(%rsp)
+    mov   %r14, 0x30(%rsp)
+    mov   %r15, 0x38(%rsp)
 
     # r12-r15 <- masked (BH + BL)
-    and %rax, %r12
-    and %rax, %r13
-    and %rax, %r14
-    and %rax, %r15
+    and   %rax, %r12
+    and   %rax, %r13
+    and   %rax, %r14
+    and   %rax, %r15
 
     # r8-r11 <- masked (AH + AL)
-    and %rbx, %r8
-    and %rbx, %r9
-    and %rbx, %r10
-    and %rbx, %r11
+    and   %rbx, %r8
+    and   %rbx, %r9
+    and   %rbx, %r10
+    and   %rbx, %r11
 
-    # r8-r11 <- masked (AH + AL) + masked (AH + AL)
-    add %r12, %r8
-    adc %r13, %r9
-    adc %r14, %r10
-    adc %r15, %r11
-    mov %r8, 0x40(%rsp)
-    mov %r9, 0x48(%rsp)
-    mov %r10, 0x50(%rsp)
-    mov %r11, 0x58(%rsp)
+    # r8-r11 <- masked (AH + AL) + masked (BH + BL)
+    add   %r12, %r8
+    adc   %r13, %r9
+    adc   %r14, %r10
+    adc   %r15, %r11
+    mov    %r8, 0x40(%rsp)
+    mov    %r9, 0x48(%rsp)
+    mov   %r10, 0x50(%rsp)
+    mov   %r11, 0x58(%rsp)
 
-    # [rcx+64] <- (AH+AL) x (BH+BL)
+    # [rsp] <- CM = (AH+AL) x (BH+BL)
     $mul256_low
-    # [rcx] <- AL x BL (Result c0-c3)
+    # [rcx] <- CL = AL x BL (Result c0-c3)
     $mul256_albl
-    # [rsp] <- AH x BH
-    $mul256_ahbh
+    # [rcx+64] <- CH = AH x BH
+    $mul192_ahbh
 
     # r8-r11 <- (AH+AL) x (BH+BL), final step
-    mov 0x40(%rsp), %r8
-    mov 0x48(%rsp), %r9
-    mov 0x50(%rsp), %r10
-    mov 0x58(%rsp), %r11
-    mov 0x60(%rcx), %rax
-    add %rax, %r8
-    mov 0x68(%rcx), %rax
-    adc %rax, %r9
-    mov 0x70(%rcx), %rax
-    adc %rax, %r10
-    mov 0x78(%rcx), %rax
-    adc %rax, %r11
+    mov   0x40(%rsp),  %r8
+    mov   0x48(%rsp),  %r9
+    mov   0x50(%rsp), %r10
+    mov   0x58(%rsp), %r11
 
-    # [rcx+64], x3-x5 <- (AH+AL) x (BH+BL) - ALxBL
-    mov 0x40(%rcx), %r12
-    mov 0x48(%rcx), %r13
-    mov 0x50(%rcx), %r14
-    mov 0x58(%rcx), %r15
-    sub (%rcx), %r12
-    sbb 0x8(%rcx), %r13
-    sbb 0x10(%rcx), %r14
-    sbb 0x18(%rcx), %r15
-    sbb 0x20(%rcx), %r8
-    sbb 0x28(%rcx), %r9
-    sbb 0x30(%rcx), %r10
-    sbb 0x38(%rcx), %r11
+    mov   0x20(%rsp), %rax
+    add   %rax, %r8
+    mov   0x28(%rsp), %rax
+    adc   %rax, %r9
+    mov   0x30(%rsp), %rax
+    adc   %rax, %r10
+    mov   0x38(%rsp), %rax
+    adc   %rax, %r11
+
+    # [rsp], x3-x5 <- (AH+AL) x (BH+BL) - ALxBL
+    mov    0x0(%rsp), %r12
+    mov    0x8(%rsp), %r13
+    mov   0x10(%rsp), %r14
+    mov   0x18(%rsp), %r15
+    sub    0x0(%rcx), %r12
+    sbb    0x8(%rcx), %r13
+    sbb   0x10(%rcx), %r14
+    sbb   0x18(%rcx), %r15
+    sbb   0x20(%rcx), %r8
+    sbb   0x28(%rcx), %r9
+    sbb   0x30(%rcx), %r10
+    sbb   0x38(%rcx), %r11
 
     # r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
-    sub (%rsp), %r12
-    sbb 0x8(%rsp), %r13
-    sbb 0x10(%rsp), %r14
-    sbb 0x18(%rsp), %r15
-    sbb 0x20(%rsp), %r8
-    sbb 0x28(%rsp), %r9
-    sbb 0x30(%rsp), %r10
-    sbb 0x38(%rsp), %r11
+    sub   0x40(%rcx), %r12
+    sbb   0x48(%rcx), %r13
+    sbb   0x50(%rcx), %r14
+    sbb   0x58(%rcx), %r15
+    sbb   0x60(%rcx), %r8
+    sbb   0x68(%rcx), %r9
+    sbb   \$0x0, %r10
+    sbb   \$0x0, %r11
 
-    add 0x20(%rcx), %r12
-    mov %r12, 0x20(%rcx)    # Result C4-C7
-    adc 0x28(%rcx), %r13
-    mov %r13, 0x28(%rcx)
-    adc 0x30(%rcx), %r14
-    mov %r14, 0x30(%rcx)
-    adc 0x38(%rcx), %r15
-    mov %r15, 0x38(%rcx)
-    mov (%rsp), %rax
-    adc %rax, %r8           # Result C8-C15
-    mov %r8, 0x40(%rcx)
-    mov 0x8(%rsp), %rax
-    adc %rax, %r9
-    mov %r9, 0x48(%rcx)
-    mov 0x10(%rsp), %rax
-    adc %rax, %r10
-    mov %r10, 0x50(%rcx)
-    mov 0x18(%rsp), %rax
-    adc %rax, %r11
-    mov %r11, 0x58(%rcx)
-    mov 0x20(%rsp), %r12
-    adc \$0x0, %r12
-    mov %r12, 0x60(%rcx)
-    mov 0x28(%rsp), %r13
-    adc \$0x0, %r13
-    mov %r13, 0x68(%rcx)
-    mov 0x30(%rsp), %r14
-    adc \$0x0, %r14
-    mov %r14, 0x70(%rcx)
-    mov 0x38(%rsp), %r15
-    adc \$0x0, %r15
-    mov %r15, 0x78(%rcx)
+    add   0x20(%rcx), %r12
+    mov   %r12, 0x20(%rcx)    # Result C4-C7
+    adc   0x28(%rcx), %r13
+    mov   %r13, 0x28(%rcx)
+    adc   0x30(%rcx), %r14
+    mov   %r14, 0x30(%rcx)
+    adc   0x38(%rcx), %r15
+    mov   %r15, 0x38(%rcx)
+    adc   0x40(%rcx), %r8
+    mov   %r8, 0x40(%rcx)     # Result C8-C13
+    adc   0x48(%rcx), %r9
+    mov   %r9, 0x48(%rcx)
+    adc   0x50(%rcx), %r10
+    mov   %r10, 0x50(%rcx)
+    adc   0x58(%rcx), %r11
+    mov   %r11, 0x58(%rcx)
+    mov   0x60(%rcx), %r12    # mov keeps CF: ripple the carry into the top two words
+    adc   \$0x0, %r12
+    mov   %r12, 0x60(%rcx)
+    mov   0x68(%rcx), %r13
+    adc   \$0x0, %r13
+    mov   %r13, 0x68(%rcx)
 
     add \$96, %rsp
   .cfi_adjust_cfa_offset -96
@@ -804,6 +1108,461 @@
     pop %rbx
   .cfi_adjust_cfa_offset -8
   .cfi_same_value rbx
+___
+  return $body;
+}
+
+# 434-bit multiplication using Karatsuba (one level),
+# schoolbook (one level).
+sub mul {
+  my $code=<<___;
+    mov %rdx, %rcx
+
+    sub \$112,  %rsp           # Allocating space in stack
+  .cfi_adjust_cfa_offset 112
+
+    # rcx[0-3] <- AH+AL
+    xor %rax, %rax
+    mov 0x20(%rdi), %r8
+    mov 0x28(%rdi), %r9
+    mov 0x30(%rdi), %r10
+    xor       %r11, %r11
+    add  0x0(%rdi), %r8
+    adc  0x8(%rdi), %r9
+    adc 0x10(%rdi), %r10
+    adc 0x18(%rdi), %r11
+    # store AH+AL mask
+    sbb  \$0,  %rax
+    mov %rax, 0x40(%rsp)
+    # store AH+AL in 0-0x18(rcx)
+    mov %r8,   0x0(%rcx)
+    mov %r9,   0x8(%rcx)
+    mov %r10, 0x10(%rcx)
+    mov %r11, 0x18(%rcx)
+
+    # r12-r15 <- BH+BL
+    xor %rdx, %rdx
+    mov 0x20(%rsi), %r12
+    mov 0x28(%rsi), %r13
+    mov 0x30(%rsi), %r14
+    xor       %r15, %r15
+    add  0x0(%rsi), %r12
+    adc  0x8(%rsi), %r13
+    adc 0x10(%rsi), %r14
+    adc 0x18(%rsi), %r15
+    sbb \$0x0, %rdx
+    # store BH+BL mask
+    mov %rdx, 0x48(%rsp)
+
+    # (rsp[0-0x38]) <- (AH+AL)*(BH+BL)
+    mov (%rcx), %rax
+    mul %r12
+    mov %rax, (%rsp)            # c0
+    mov %rdx, %r8
+
+    xor %r9,  %r9
+    mov (%rcx), %rax
+    mul %r13
+    add %rax, %r8
+    adc %rdx, %r9
+
+    xor %r10, %r10
+    mov 0x8(%rcx), %rax
+    mul %r12
+    add %rax, %r8
+    mov %r8,  0x8(%rsp)          # c1
+    adc %rdx, %r9
+    adc \$0x0,%r10
+
+    xor %r8, %r8
+    mov (%rcx), %rax
+    mul %r14
+    add %rax, %r9
+    adc %rdx, %r10
+    adc \$0x0,%r8
+
+    mov 0x10(%rcx), %rax
+    mul %r12
+    add %rax, %r9
+    adc %rdx, %r10
+    adc \$0x0,%r8
+
+    mov 0x8(%rcx), %rax
+    mul %r13
+    add %rax, %r9
+    mov %r9, 0x10(%rsp)         # c2
+    adc %rdx, %r10
+    adc \$0x0, %r8
+
+    xor %r9, %r9
+    mov (%rcx),%rax
+    mul %r15
+    add %rax, %r10
+    adc %rdx, %r8
+    adc \$0x0,%r9
+
+    mov 0x18(%rcx), %rax
+    mul %r12
+    add %rax, %r10
+    adc %rdx, %r8
+    adc \$0x0,%r9
+
+    mov 0x8(%rcx), %rax
+    mul %r14
+    add %rax, %r10
+    adc %rdx, %r8
+    adc \$0x0,%r9
+
+    mov 0x10(%rcx), %rax
+    mul %r13
+    add %rax, %r10
+    mov %r10, 0x18(%rsp)        # c3
+    adc %rdx, %r8
+    adc \$0x0, %r9
+
+    xor %r10, %r10
+    mov 0x8(%rcx), %rax
+    mul %r15
+    add %rax, %r8
+    adc %rdx, %r9
+    adc \$0x0,%r10
+
+    mov 0x18(%rcx), %rax
+    mul %r13
+    add %rax, %r8
+    adc %rdx, %r9
+    adc \$0x0,%r10
+
+    mov 0x10(%rcx), %rax
+    mul %r14
+    add %rax, %r8               # c4
+    mov  %r8, 0x20(%rsp)
+    adc %rdx, %r9
+    adc \$0x0,%r10
+
+    xor %r11, %r11
+    mov 0x10(%rcx), %rax
+    mul %r15
+    add %rax, %r9
+    adc %rdx, %r10
+    adc \$0x0,%r11
+
+    mov 0x18(%rcx), %rax
+    mul %r14
+    add %rax, %r9               # c5
+    mov  %r9, 0x28(%rsp)
+    adc %rdx, %r10
+    adc \$0x0,%r11
+
+    mov 0x18(%rcx), %rax
+    mul %r15
+    add %rax, %r10              # c6
+    mov %r10, 0x30(%rsp)
+    adc %rdx, %r11              # c7
+    mov %r11, 0x38(%rsp)
+
+    # r12-r15 <- masked (BH + BL)
+    mov 0x40(%rsp), %rax          # AH+AL carry mask saved above
+    and %rax, %r12
+    and %rax, %r13
+    and %rax, %r14
+    and %rax, %r15
+
+    # r8-r11 <- masked (AH + AL)
+    mov 0x48(%rsp),%rax           # BH+BL carry mask saved above
+    mov 0x00(%rcx), %r8
+    and %rax, %r8
+    mov 0x08(%rcx), %r9
+    and %rax, %r9
+    mov 0x10(%rcx), %r10
+    and %rax, %r10
+    mov 0x18(%rcx), %r11
+    and %rax, %r11
+
+    # r12-r15 <- masked (AH + AL) + masked (BH + BL)
+    add  %r8, %r12
+    adc  %r9, %r13
+    adc %r10, %r14
+    adc %r11, %r15
+
+    # rsp[0x50-0x68] <- (AH+AL) x (BH+BL) high (c4-c7) + masked terms
+    mov 0x20(%rsp), %rax
+    add %rax, %r12
+    mov 0x28(%rsp), %rax
+    adc %rax, %r13
+    mov 0x30(%rsp), %rax
+    adc %rax, %r14
+    mov 0x38(%rsp), %rax
+    adc %rax, %r15
+    mov %r12, 0x50(%rsp)
+    mov %r13, 0x58(%rsp)
+    mov %r14, 0x60(%rsp)
+    mov %r15, 0x68(%rsp)
+
+    # [rcx] <- CL = AL x BL
+    mov (%rdi), %r11
+    mov (%rsi), %rax
+    mul %r11
+    xor %r9,  %r9
+    mov %rax, (%rcx)              # c0
+    mov %rdx, %r8
+
+    mov 0x10(%rdi), %r14
+    mov 0x8(%rsi), %rax
+    mul %r11
+    xor %r10, %r10
+    add %rax, %r8
+    adc %rdx, %r9
+
+    mov 0x8(%rdi), %r12
+    mov (%rsi), %rax
+    mul %r12
+    add %rax, %r8
+    mov %r8,  0x8(%rcx)            # c1
+    adc %rdx, %r9
+    adc \$0x0,%r10
+
+    xor %r8,  %r8
+    mov 0x10(%rsi), %rax
+    mul %r11
+    add %rax, %r9
+    adc %rdx, %r10
+    adc \$0x0,%r8
+
+    mov (%rsi),%r13
+    mov %r14,  %rax
+    mul %r13
+    add %rax, %r9
+    adc %rdx, %r10
+    adc \$0x0,%r8
+
+    mov 0x8(%rsi), %rax
+    mul %r12
+    add %rax, %r9
+    mov %r9, 0x10(%rcx)           # c2
+    adc %rdx, %r10
+    adc \$0x0,%r8
+
+    xor %r9,  %r9
+    mov 0x18(%rsi), %rax
+    mul %r11
+    mov 0x18(%rdi), %r15
+    add %rax, %r10
+    adc %rdx, %r8
+    adc \$0x0,%r9
+
+    mov %r15, %rax
+    mul %r13
+    add %rax, %r10
+    adc %rdx, %r8
+    adc \$0x0,%r9
+
+    mov 0x10(%rsi), %rax
+    mul %r12
+    add %rax, %r10
+    adc %rdx, %r8
+    adc \$0x0,%r9
+
+    mov 0x8(%rsi), %rax
+    mul %r14
+    add %rax, %r10
+    mov %r10, 0x18(%rcx)           # c3
+    adc %rdx, %r8
+    adc \$0x0,%r9
+
+    xor %r10, %r10
+    mov 0x18(%rsi), %rax
+    mul %r12
+    add %rax, %r8
+    adc %rdx, %r9
+    adc \$0x0,%r10
+
+    mov 0x8(%rsi), %rax
+    mul %r15
+    add %rax, %r8
+    adc %rdx, %r9
+    adc \$0x0,%r10
+
+    mov 0x10(%rsi), %rax
+    mul %r14
+    add %rax, %r8
+    mov %r8,  0x20(%rcx)           # c4
+    adc %rdx, %r9
+    adc \$0x0,%r10
+
+    xor %r8, %r8
+    mov 0x18(%rsi), %rax
+    mul %r14
+    add %rax, %r9
+    adc %rdx, %r10
+    adc \$0x0,%r8
+
+    mov 0x10(%rsi), %rax
+    mul %r15
+    add %rax, %r9
+    mov %r9,  0x28(%rcx)           # c5
+    adc %rdx, %r10
+    adc \$0x0,%r8
+
+    mov 0x18(%rsi), %rax
+    mul %r15
+    add %rax, %r10
+    mov %r10, 0x30(%rcx)          # c6
+    adc %rdx, %r8
+    mov %r8,  0x38(%rcx)          # c7
+
+    # rcx[0x40-0x68] <- AH*BH
+    # multiplies 2 192-bit numbers A,B
+    mov 0x20(%rdi), %r11
+    mov 0x20(%rsi), %rax
+    mul %r11
+    xor %r9,  %r9
+    mov %rax, 0x40(%rcx)   # c0
+    mov %rdx, %r8
+
+    mov 0x30(%rdi), %r14
+    mov 0x28(%rsi), %rax
+    mul %r11
+    xor %r10, %r10
+    add %rax, %r8
+    adc %rdx, %r9
+
+    mov 0x28(%rdi), %r12
+    mov 0x20(%rsi), %rax
+    mul %r12
+    add %rax, %r8
+    mov %r8,  0x48(%rcx)    # c1
+    adc %rdx, %r9
+    adc \$0x0,%r10
+
+    xor %r8,  %r8
+    mov 0x30(%rsi), %rax
+    mul %r11
+    add %rax, %r9
+    adc %rdx, %r10
+    adc \$0x0,%r8
+
+    mov 0x20(%rsi), %r13
+    mov %r14, %rax
+    mul %r13
+    add %rax, %r9
+    adc %rdx, %r10
+    adc \$0x0,%r8
+
+    mov 0x28(%rsi), %rax
+    mul %r12
+    add %rax, %r9
+    mov %r9,  0x50(%rcx)    # c2
+    adc %rdx, %r10
+    adc \$0x0,%r8
+
+    mov 0x30(%rsi), %rax
+    mul %r12
+    xor %r12, %r12
+    add %rax, %r10
+    adc %rdx, %r8
+    adc \$0x0,%r12
+
+    mov 0x28(%rsi), %rax
+    mul %r14
+    add %rax, %r10
+    adc %rdx, %r8
+    adc \$0x0,%r12
+    mov %r10, 0x58(%rcx)   # c3
+
+    mov 0x30(%rsi), %rax
+    mul %r14
+    add %rax, %r8
+    adc \$0x0,%r12
+    mov %r8,  0x60(%rcx)    # c4
+
+    add %r12, %rdx         # c5
+
+    # [r8-r15] <- (AH+AL)x(BH+BL) - ALxBL
+    mov  0x0(%rsp), %r8
+    sub  0x0(%rcx), %r8
+    mov  0x8(%rsp), %r9
+    sbb  0x8(%rcx), %r9
+    mov 0x10(%rsp), %r10
+    sbb 0x10(%rcx), %r10
+    mov 0x18(%rsp), %r11
+    sbb 0x18(%rcx), %r11
+    mov 0x50(%rsp), %r12
+    sbb 0x20(%rcx), %r12
+    mov 0x58(%rsp), %r13
+    sbb 0x28(%rcx), %r13
+    mov 0x60(%rsp), %r14
+    sbb 0x30(%rcx), %r14
+    mov 0x68(%rsp), %r15
+    sbb 0x38(%rcx), %r15
+
+    # [r8-r15] <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
+    mov 0x40(%rcx), %rax
+    sub %rax, %r8
+    mov 0x48(%rcx), %rax
+    sbb %rax, %r9
+    mov 0x50(%rcx), %rax
+    sbb %rax, %r10
+    mov 0x58(%rcx), %rax
+    sbb %rax, %r11
+    mov 0x60(%rcx), %rax
+    sbb %rax, %r12
+    sbb %rdx, %r13
+    sbb \$0x0,%r14
+    sbb \$0x0,%r15
+
+    # Final result
+    add 0x20(%rcx), %r8
+    mov %r8, 0x20(%rcx)    # Result C4-C7
+    adc 0x28(%rcx), %r9
+    mov %r9, 0x28(%rcx)
+    adc 0x30(%rcx), %r10
+    mov %r10, 0x30(%rcx)
+    adc 0x38(%rcx), %r11
+    mov %r11, 0x38(%rcx)
+    adc 0x40(%rcx), %r12
+    mov %r12, 0x40(%rcx)   # Result C8-C13
+    adc 0x48(%rcx), %r13
+    mov %r13, 0x48(%rcx)
+    adc 0x50(%rcx), %r14
+    mov %r14, 0x50(%rcx)
+    adc 0x58(%rcx), %r15
+    mov %r15, 0x58(%rcx)
+    mov 0x60(%rcx), %r12
+    adc \$0x0, %r12
+    mov %r12, 0x60(%rcx)
+    adc \$0x0, %rdx
+    mov %rdx, 0x68(%rcx)
+
+    add \$112, %rsp        # Restoring space in stack
+  .cfi_adjust_cfa_offset -112
+___
+
+  return $code;
+}
+
+#  Integer multiplication based on Karatsuba method
+#  Operation: c [rdx] = a [rdi] * b [rsi]
+#  NOTE: a=c or b=c are not allowed
+sub sike_mul {
+  my $jump_mul_bdw=$bmi2_adx ? &alt_impl(".Lmul_bdw") : "";
+  # MUL for Broadwell CPUs (empty string when $bmi2_adx is unset)
+  my $mul_bdw=$bmi2_adx ? &mul_bdw() : "";
+  # MUL for CPUs older than Broadwell
+  my $mul=&mul();
+
+  my $body=<<___;
+  .Lmul_bdw:
+  .cfi_startproc
+    # sike_mpmul has already pushed r12--15 by this point.
+  .cfi_adjust_cfa_offset 32
+  .cfi_offset r12, -16
+  .cfi_offset r13, -24
+  .cfi_offset r14, -32
+  .cfi_offset r15, -40
+
+    $mul_bdw
+
     pop %r15
   .cfi_adjust_cfa_offset -8
   .cfi_same_value r15
@@ -819,34 +1578,6 @@
       ret
   .cfi_endproc
 
-___
-  return $body;
-}
-
-# Jump to alternative implemenatation provided as an
-# argument in case CPU supports ADOX/ADCX and MULX instructions.
-sub alt_impl {
-  $jmp_func = shift;
-
-  $body=<<___;
-  lea OPENSSL_ia32cap_P(%rip), %rcx
-  mov 8(%rcx), %rcx
-  and \$0x80100, %ecx
-  cmp \$0x80100, %ecx
-  je  $jmp_func
-
-___
-  return $body
-}
-
-#  Integer multiplication based on Karatsuba method
-#  Operation: c [rdx] = a [rdi] * b [rsi]
-#  NOTE: a=c or b=c are not allowed
-sub mul {
-  my $jump_optim.=&alt_impl(".Lmul_mulx") if ($bmi2_adx);
-  my $body.=&mul_mulx() if ($bmi2_adx);
-
-  $body.=<<___;
   .globl  ${PREFIX}_mpmul
   .type   ${PREFIX}_mpmul,\@function,3
   ${PREFIX}_mpmul:
@@ -864,461 +1595,12 @@
   .cfi_adjust_cfa_offset 8
   .cfi_offset r15, -40
 
-    $jump_optim
+    # Jump to optimized implementation if
+    # CPU supports ADCX/ADOX/MULX
+    $jump_mul_bdw
+    # Otherwise use generic implementation
+    $mul
 
-    mov %rdx, %rcx
-
-    # rcx[0-3] <- AH+AL
-    xor %rax, %rax
-    mov 0x20(%rdi), %r8
-    mov 0x28(%rdi), %r9
-    mov 0x30(%rdi), %r10
-    mov 0x38(%rdi), %r11
-    add  0x0(%rdi), %r8
-    adc  0x8(%rdi), %r9
-    adc 0x10(%rdi), %r10
-    adc 0x18(%rdi), %r11
-    mov %r8,   0x0(%rcx)
-    mov %r9,   0x8(%rcx)
-    mov %r10, 0x10(%rcx)
-    mov %r11, 0x18(%rcx)
-    sbb  \$0,  %rax
-    sub \$80,  %rsp           # Allocating space in stack
-  .cfi_adjust_cfa_offset 80
-
-    # r12-r15 <- BH+BL
-    xor %rdx, %rdx
-    mov 0x20(%rsi), %r12
-    mov 0x28(%rsi), %r13
-    mov 0x30(%rsi), %r14
-    mov 0x38(%rsi), %r15
-    add  0x0(%rsi), %r12
-    adc  0x8(%rsi), %r13
-    adc 0x10(%rsi), %r14
-    adc 0x18(%rsi), %r15
-    sbb \$0x0, %rdx
-    mov %rax, 0x40(%rsp)
-    mov %rdx, 0x48(%rsp)
-
-    # (rsp[0-3],r8,r9,r10,r11) <- (AH+AL)*(BH+BL)
-    mov (%rcx), %rax
-    mul %r12
-    mov %rax, (%rsp)            # c0
-    mov %rdx, %r8
-
-    xor %r9, %r9
-    mov (%rcx), %rax
-    mul %r13
-    add %rax, %r8
-    adc %rdx, %r9
-
-    xor %r10, %r10
-    mov 0x8(%rcx), %rax
-    mul %r12
-    add %rax, %r8
-    mov %r8, 0x8(%rsp)          # c1
-    adc %rdx, %r9
-    adc \$0x0, %r10
-
-    xor %r8, %r8
-    mov (%rcx), %rax
-    mul %r14
-    add %rax, %r9
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    mov 0x10(%rcx), %rax
-    mul %r12
-    add %rax, %r9
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    mov 0x8(%rcx), %rax
-    mul %r13
-    add %rax, %r9
-    mov %r9, 0x10(%rsp)         # c2
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    xor %r9, %r9
-    mov (%rcx), %rax
-    mul %r15
-    add %rax, %r10
-    adc %rdx, %r8
-    adc \$0x0, %r9
-
-    mov 0x18(%rcx), %rax
-    mul %r12
-    add %rax, %r10
-    adc %rdx, %r8
-    adc \$0x0, %r9
-
-    mov 0x8(%rcx), %rax
-    mul %r14
-    add %rax, %r10
-    adc %rdx, %r8
-    adc \$0x0, %r9
-
-    mov 0x10(%rcx), %rax
-    mul %r13
-    add %rax, %r10
-    mov %r10, 0x18(%rsp)        # c3
-    adc %rdx, %r8
-    adc \$0x0, %r9
-
-    xor %r10, %r10
-    mov 0x8(%rcx), %rax
-    mul %r15
-    add %rax, %r8
-    adc %rdx, %r9
-    adc \$0x0, %r10
-
-    mov 0x18(%rcx), %rax
-    mul %r13
-    add %rax, %r8
-    adc %rdx, %r9
-    adc \$0x0, %r10
-
-    mov 0x10(%rcx), %rax
-    mul %r14
-    add %rax, %r8
-    mov %r8, 0x20(%rsp)          # c4
-    adc %rdx, %r9
-    adc \$0x0, %r10
-
-    xor %r11, %r11
-    mov 0x10(%rcx), %rax
-    mul %r15
-    add %rax, %r9
-    adc %rdx, %r10
-    adc \$0x0, %r11
-
-    mov 0x18(%rcx), %rax
-    mul %r14
-    add %rax, %r9               # c5
-    adc %rdx, %r10
-    adc \$0x0, %r11
-
-    mov 0x18(%rcx), %rax
-    mul %r15
-    add %rax, %r10              # c6
-    adc %rdx, %r11              # c7
-
-    mov 0x40(%rsp), %rax
-    and %rax, %r12
-    and %rax, %r13
-    and %rax, %r14
-    and %rax, %r15
-    add %r8, %r12
-    adc %r9, %r13
-    adc %r10, %r14
-    adc %r11, %r15
-
-    mov 0x48(%rsp), %rax
-    mov (%rcx), %r8
-    mov 0x8(%rcx), %r9
-    mov 0x10(%rcx), %r10
-    mov 0x18(%rcx), %r11
-    and %rax, %r8
-    and %rax, %r9
-    and %rax, %r10
-    and %rax, %r11
-    add %r12, %r8
-    adc %r13, %r9
-    adc %r14, %r10
-    adc %r15, %r11
-    mov %r8, 0x20(%rsp)
-    mov %r9, 0x28(%rsp)
-    mov %r10, 0x30(%rsp)
-    mov %r11, 0x38(%rsp)
-
-    mov (%rdi), %r11
-    mov (%rsi), %rax
-    mul %r11
-    xor %r9, %r9
-    mov %rax, (%rcx)              # c0
-    mov %rdx, %r8
-
-    mov 0x10(%rdi), %r14
-    mov 0x8(%rsi), %rax
-    mul %r11
-    xor %r10, %r10
-    add %rax, %r8
-    adc %rdx, %r9
-
-    mov 0x8(%rdi), %r12
-    mov (%rsi), %rax
-    mul %r12
-    add %rax, %r8
-    mov %r8, 0x8(%rcx)            # c1
-    adc %rdx, %r9
-    adc \$0x0, %r10
-
-    xor %r8, %r8
-    mov 0x10(%rsi), %rax
-    mul %r11
-    add %rax, %r9
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    mov (%rsi), %r13
-    mov %r14, %rax
-    mul %r13
-    add %rax, %r9
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    mov 0x8(%rsi), %rax
-    mul %r12
-    add %rax, %r9
-    mov %r9, 0x10(%rcx)           # c2
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    xor %r9, %r9
-    mov 0x18(%rsi), %rax
-    mul %r11
-    mov 0x18(%rdi), %r15
-    add %rax, %r10
-    adc %rdx, %r8
-    adc \$0x0, %r9
-
-    mov %r15, %rax
-    mul %r13
-    add %rax, %r10
-    adc %rdx, %r8
-    adc \$0x0, %r9
-
-    mov 0x10(%rsi), %rax
-    mul %r12
-    add %rax, %r10
-    adc %rdx, %r8
-    adc \$0x0, %r9
-
-    mov 0x8(%rsi), %rax
-    mul %r14
-    add %rax, %r10
-    mov %r10, 0x18(%rcx)           # c3
-    adc %rdx, %r8
-    adc \$0x0, %r9
-
-    xor %r10, %r10
-    mov 0x18(%rsi), %rax
-    mul %r12
-    add %rax, %r8
-    adc %rdx, %r9
-    adc \$0x0, %r10
-
-    mov 0x8(%rsi), %rax
-    mul %r15
-    add %rax, %r8
-    adc %rdx, %r9
-    adc \$0x0, %r10
-
-    mov 0x10(%rsi), %rax
-    mul %r14
-    add %rax, %r8
-    mov %r8, 0x20(%rcx)           # c4
-    adc %rdx, %r9
-    adc \$0x0, %r10
-
-    xor %r8, %r8
-    mov 0x18(%rsi), %rax
-    mul %r14
-    add %rax, %r9
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    mov 0x10(%rsi), %rax
-    mul %r15
-    add %rax, %r9
-    mov %r9, 0x28(%rcx)           # c5
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    mov 0x18(%rsi), %rax
-    mul %r15
-    add %rax, %r10
-    mov %r10, 0x30(%rcx)          # c6
-    adc %rdx, %r8
-    mov %r8, 0x38(%rcx)           # c7
-
-    # rcx[8-15] <- AH*BH
-    mov 0x20(%rdi), %r11
-    mov 0x20(%rsi), %rax
-    mul %r11
-    xor %r9, %r9
-    mov %rax, 0x40(%rcx)          # c0
-    mov %rdx, %r8
-
-    mov 0x30(%rdi), %r14
-    mov 0x28(%rsi), %rax
-    mul %r11
-    xor %r10, %r10
-    add %rax, %r8
-    adc %rdx, %r9
-
-    mov 0x28(%rdi), %r12
-    mov 0x20(%rsi), %rax
-    mul %r12
-    add %rax, %r8
-    mov %r8, 0x48(%rcx)           # c1
-    adc %rdx, %r9
-    adc \$0x0, %r10
-
-    xor %r8, %r8
-    mov 0x30(%rsi), %rax
-    mul %r11
-    add %rax, %r9
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    mov 0x20(%rsi), %r13
-    mov %r14, %rax
-    mul %r13
-    add %rax, %r9
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    mov 0x28(%rsi), %rax
-    mul %r12
-    add %rax, %r9
-    mov %r9, 0x50(%rcx)             # c2
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    xor %r9, %r9
-    mov 0x38(%rsi), %rax
-    mul %r11
-    mov 0x38(%rdi), %r15
-    add %rax, %r10
-    adc %rdx, %r8
-    adc \$0x0, %r9
-
-    mov %r15, %rax
-    mul %r13
-    add %rax, %r10
-    adc %rdx, %r8
-    adc \$0x0, %r9
-
-    mov 0x30(%rsi), %rax
-    mul %r12
-    add %rax, %r10
-    adc %rdx, %r8
-    adc \$0x0, %r9
-
-    mov 0x28(%rsi), %rax
-    mul %r14
-    add %rax, %r10
-    mov %r10, 0x58(%rcx)            # c3
-    adc %rdx, %r8
-    adc \$0x0, %r9
-
-    xor %r10, %r10
-    mov 0x38(%rsi), %rax
-    mul %r12
-    add %rax, %r8
-    adc %rdx, %r9
-    adc \$0x0, %r10
-
-    mov 0x28(%rsi), %rax
-    mul %r15
-    add %rax, %r8
-    adc %rdx, %r9
-    adc \$0x0, %r10
-
-    mov 0x30(%rsi), %rax
-    mul %r14
-    add %rax, %r8
-    mov %r8, 0x60(%rcx)             # c4
-    adc %rdx, %r9
-    adc \$0x0, %r10
-
-    xor %r8, %r8
-    mov 0x38(%rsi), %rax
-    mul %r14
-    add %rax, %r9
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    mov 0x30(%rsi), %rax
-    mul %r15
-    add %rax, %r9
-    mov %r9, 0x68(%rcx)             # c5
-    adc %rdx, %r10
-    adc \$0x0, %r8
-
-    mov 0x38(%rsi), %rax
-    mul %r15
-    add %rax, %r10
-    mov %r10, 0x70(%rcx)            # c6
-    adc %rdx, %r8
-    mov %r8, 0x78(%rcx)             # c7
-
-    # [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL
-    mov  0x0(%rsp), %r8
-    sub  0x0(%rcx), %r8
-    mov  0x8(%rsp), %r9
-    sbb  0x8(%rcx), %r9
-    mov 0x10(%rsp), %r10
-    sbb 0x10(%rcx), %r10
-    mov 0x18(%rsp), %r11
-    sbb 0x18(%rcx), %r11
-    mov 0x20(%rsp), %r12
-    sbb 0x20(%rcx), %r12
-    mov 0x28(%rsp), %r13
-    sbb 0x28(%rcx), %r13
-    mov 0x30(%rsp), %r14
-    sbb 0x30(%rcx), %r14
-    mov 0x38(%rsp), %r15
-    sbb 0x38(%rcx), %r15
-
-    # [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH
-    mov 0x40(%rcx), %rax
-    sub %rax, %r8
-    mov 0x48(%rcx), %rax
-    sbb %rax, %r9
-    mov 0x50(%rcx), %rax
-    sbb %rax, %r10
-    mov 0x58(%rcx), %rax
-    sbb %rax, %r11
-    mov 0x60(%rcx), %rax
-    sbb %rax, %r12
-    mov 0x68(%rcx), %rdx
-    sbb %rdx, %r13
-    mov 0x70(%rcx), %rdi
-    sbb %rdi, %r14
-    mov 0x78(%rcx), %rsi
-    sbb %rsi, %r15
-
-    # Final result
-    add 0x20(%rcx),  %r8
-    mov %r8,  0x20(%rcx)
-    adc 0x28(%rcx),  %r9
-    mov %r9,  0x28(%rcx)
-    adc 0x30(%rcx), %r10
-    mov %r10, 0x30(%rcx)
-    adc 0x38(%rcx), %r11
-    mov %r11, 0x38(%rcx)
-    adc 0x40(%rcx), %r12
-    mov %r12, 0x40(%rcx)
-    adc 0x48(%rcx), %r13
-    mov %r13, 0x48(%rcx)
-    adc 0x50(%rcx), %r14
-    mov %r14, 0x50(%rcx)
-    adc 0x58(%rcx), %r15
-    mov %r15, 0x58(%rcx)
-    adc \$0x0, %rax
-    mov %rax, 0x60(%rcx)
-    adc \$0x0, %rdx
-    mov %rdx, 0x68(%rcx)
-    adc \$0x0, %rdi
-    mov %rdi, 0x70(%rcx)
-    adc \$0x0, %rsi
-    mov %rsi, 0x78(%rcx)
-
-    add \$80, %rsp           # Restoring space in stack
-  .cfi_adjust_cfa_offset -80
     pop %r15
   .cfi_adjust_cfa_offset -8
     pop %r14
@@ -1334,513 +1616,7 @@
   return $body;
 }
 
-$code.=&mul();
-
-#  Optimized Montgomery reduction for CPUs with ADOX/ADCX and MULX
-#  Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
-#  Operation: c [rsi] = a [rdi]
-#  NOTE: a=c is not allowed
-sub rdc_mulx {
-  # a[0-1] x .Lp503p1_nz --> result: r8:r14
-  my $mul01=&mul128x320_school(0,"rdi",".Lp503p1_nz(%rip)",map("r$_",(8..14)),"rbx","rcx","r15");
-  # a[2-3] x .Lp503p1_nz --> result: r8:r14
-  my $mul23=&mul128x320_school(16,"rdi",".Lp503p1_nz(%rip)",map("r$_",(8..14)),"rbx","rcx","r15");
-  # a[4-5] x .Lp503p1_nz --> result: r8:r14
-  my $mul45=&mul128x320_school(32,"rdi",".Lp503p1_nz(%rip)",map("r$_",(8..14)),"rbx","rcx","r15");
-  # a[6-7] x .Lp503p1_nz --> result: r8:r14
-  my $mul67=&mul128x320_school(48,"rdi",".Lp503p1_nz(%rip)",map("r$_", (8..14)),"rbx","rcx","r15");
-
-  my $body=<<___;
-    .Lrdc_mulx_asm:
-    .cfi_startproc
-      # sike_fprdc has already pushed r12--15 and rbx by this point.
-    .cfi_adjust_cfa_offset 32
-    .cfi_offset r12, -16
-    .cfi_offset r13, -24
-    .cfi_offset r14, -32
-    .cfi_offset r15, -40
-    .cfi_offset rbx, -48
-    .cfi_adjust_cfa_offset 8
-
-    $mul01
-
-    xor %r15, %r15
-    add 0x18(%rdi), %r8
-    adc 0x20(%rdi), %r9
-    adc 0x28(%rdi), %r10
-    adc 0x30(%rdi), %r11
-    adc 0x38(%rdi), %r12
-    adc 0x40(%rdi), %r13
-    adc 0x48(%rdi), %r14
-    adc 0x50(%rdi), %r15
-    mov %r8, 0x18(%rdi)
-    mov %r9, 0x20(%rdi)
-    mov %r10, 0x28(%rdi)
-    mov %r11, 0x30(%rdi)
-    mov %r12, 0x38(%rdi)
-    mov %r13, 0x40(%rdi)
-    mov %r14, 0x48(%rdi)
-    mov %r15, 0x50(%rdi)
-    mov 0x58(%rdi), %r8
-    mov 0x60(%rdi), %r9
-    mov 0x68(%rdi), %r10
-    mov 0x70(%rdi), %r11
-    mov 0x78(%rdi), %r12
-    adc \$0x0, %r8
-    adc \$0x0, %r9
-    adc \$0x0, %r10
-    adc \$0x0, %r11
-    adc \$0x0, %r12
-    mov %r8, 0x58(%rdi)
-    mov %r9, 0x60(%rdi)
-    mov %r10, 0x68(%rdi)
-    mov %r11, 0x70(%rdi)
-    mov %r12, 0x78(%rdi)
-
-    $mul23
-
-    xor %r15, %r15
-    add 0x28(%rdi), %r8
-    adc 0x30(%rdi), %r9
-    adc 0x38(%rdi), %r10
-    adc 0x40(%rdi), %r11
-    adc 0x48(%rdi), %r12
-    adc 0x50(%rdi), %r13
-    adc 0x58(%rdi), %r14
-    adc 0x60(%rdi), %r15
-    mov %r8, 0x28(%rdi)
-    mov %r9, 0x30(%rdi)
-    mov %r10, 0x38(%rdi)
-    mov %r11, 0x40(%rdi)
-    mov %r12, 0x48(%rdi)
-    mov %r13, 0x50(%rdi)
-    mov %r14, 0x58(%rdi)
-    mov %r15, 0x60(%rdi)
-    mov 0x68(%rdi), %r8
-    mov 0x70(%rdi), %r9
-    mov 0x78(%rdi), %r10
-    adc \$0x0, %r8
-    adc \$0x0, %r9
-    adc \$0x0, %r10
-    mov %r8, 0x68(%rdi)
-    mov %r9, 0x70(%rdi)
-    mov %r10, 0x78(%rdi)
-
-    $mul45
-
-    xor %r15, %r15
-    xor %rbx, %rbx
-    add 0x38(%rdi), %r8
-    adc 0x40(%rdi), %r9
-    adc 0x48(%rdi), %r10
-    adc 0x50(%rdi), %r11
-    adc 0x58(%rdi), %r12
-    adc 0x60(%rdi), %r13
-    adc 0x68(%rdi), %r14
-    adc 0x70(%rdi), %r15
-    adc 0x78(%rdi), %rbx
-    mov %r8, 0x38(%rdi)
-    mov %r9, (%rsi)         # Final result c0
-    mov %r10, 0x48(%rdi)
-    mov %r11, 0x50(%rdi)
-    mov %r12, 0x58(%rdi)
-    mov %r13, 0x60(%rdi)
-    mov %r14, 0x68(%rdi)
-    mov %r15, 0x70(%rdi)
-    mov %rbx, 0x78(%rdi)
-
-    $mul67
-
-    add 0x48(%rdi), %r8
-    adc 0x50(%rdi), %r9
-    adc 0x58(%rdi), %r10
-    adc 0x60(%rdi), %r11
-    adc 0x68(%rdi), %r12
-    adc 0x70(%rdi), %r13
-    adc 0x78(%rdi), %r14
-    mov %r8, 0x8(%rsi)
-    mov %r9, 0x10(%rsi)
-    mov %r10, 0x18(%rsi)
-    mov %r11, 0x20(%rsi)
-    mov %r12, 0x28(%rsi)
-    mov %r13, 0x30(%rsi)
-    mov %r14, 0x38(%rsi)
-
-    pop %rbx
-  .cfi_adjust_cfa_offset -8
-  .cfi_same_value rbx
-    pop %r15
-  .cfi_adjust_cfa_offset -8
-  .cfi_same_value r15
-    pop %r14
-  .cfi_adjust_cfa_offset -8
-  .cfi_same_value r14
-    pop %r13
-  .cfi_adjust_cfa_offset -8
-  .cfi_same_value r13
-    pop %r12
-  .cfi_adjust_cfa_offset -8
-  .cfi_same_value r12
-    ret
-  .cfi_endproc
-___
-  return $body;
-}
-
-#  Montgomery reduction
-#  Based on comba method
-#  Operation: c [rsi] = a [rdi]
-#  NOTE: a=c is not allowed
-sub rdc {
-  my $jump_optim=&alt_impl(".Lrdc_mulx_asm") if ($bmi2_adx);
-  my $body=&rdc_mulx() if ($bmi2_adx);
-
-  $body.=<<___;
-    .globl  ${PREFIX}_fprdc
-    .type   ${PREFIX}_fprdc,\@function,3
-    ${PREFIX}_fprdc:
-    .cfi_startproc
-      push %r12
-    .cfi_adjust_cfa_offset  8
-    .cfi_offset r12, -16
-      push %r13
-    .cfi_adjust_cfa_offset  8
-    .cfi_offset r13, -24
-      push %r14
-    .cfi_adjust_cfa_offset  8
-    .cfi_offset r14, -32
-      push %r15
-    .cfi_adjust_cfa_offset  8
-    .cfi_offset r15, -40
-      push %rbx
-    .cfi_adjust_cfa_offset  8
-    .cfi_offset rbx, -48
-
-    $jump_optim
-
-    # Reduction, generic x86 implementation
-    lea .Lp503p1(%rip), %rbx
-
-    mov (%rdi), %r11
-    mov (%rbx), %rax
-    mul %r11
-    xor %r8, %r8
-    add 0x18(%rdi), %rax
-    mov %rax, 0x18(%rsi)  # z3
-    adc %rdx, %r8
-
-    xor %r9, %r9
-    mov 0x8(%rbx), %rax
-    mul %r11
-    xor %r10, %r10
-    add %rax, %r8
-    adc %rdx, %r9
-
-    mov 0x8(%rdi), %r12
-    mov (%rbx), %rax
-    mul %r12
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-    add 0x20(%rdi), %r8
-    mov %r8, 0x20(%rsi)   # z4
-    adc  \$0, %r9
-    adc  \$0, %r10
-
-    xor %r8, %r8
-    mov 0x10(%rbx), %rax
-    mul %r11
-    add %rax, %r9
-    adc %rdx, %r10
-    adc  \$0, %r8
-
-    mov 8(%rbx), %rax
-    mul %r12
-    add %rax, %r9
-    adc %rdx, %r10
-    adc  \$0, %r8
-
-    mov 0x10(%rdi), %r13
-    mov (%rbx), %rax
-    mul %r13
-    add %rax, %r9
-    adc %rdx, %r10
-    adc  \$0, %r8
-    add 0x28(%rdi), %r9
-    mov %r9, 0x28(%rsi)   # z5
-    adc  \$0, %r10
-    adc  \$0, %r8
-
-    xor %r9, %r9
-    mov 0x18(%rbx), %rax
-    mul %r11
-    add %rax, %r10
-    adc %rdx, %r8
-    adc  \$0, %r9
-
-    mov 0x10(%rbx), %rax
-    mul %r12
-    add %rax, %r10
-    adc %rdx, %r8
-    adc  \$0, %r9
-
-    mov 0x8(%rbx), %rax
-    mul %r13
-    add %rax, %r10
-    adc %rdx, %r8
-    adc  \$0, %r9
-
-    mov 0x18(%rsi), %r14
-    mov (%rbx), %rax
-    mul %r14
-    add %rax, %r10
-    adc %rdx, %r8
-    adc  \$0, %r9
-    add 0x30(%rdi), %r10
-    mov %r10, 0x30(%rsi)    # z6
-    adc  \$0, %r8
-    adc  \$0, %r9
-
-    xor %r10, %r10
-    mov 0x20(%rbx), %rax
-    mul %r11
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-
-    mov 0x18(%rbx), %rax
-    mul %r12
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-
-    mov 0x10(%rbx), %rax
-    mul %r13
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-
-    mov 0x8(%rbx), %rax
-    mul %r14
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-
-    mov 0x20(%rsi), %r15
-    mov (%rbx), %rax
-    mul %r15
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-    add 0x38(%rdi), %r8     # Z7
-    mov %r8, 0x38(%rsi)
-    adc  \$0, %r9
-    adc  \$0, %r10
-
-    xor %r8, %r8
-    mov 0x20(%rbx), %rax
-    mul %r12
-    add %rax, %r9
-    adc %rdx, %r10
-    adc  \$0, %r8
-
-    mov 0x18(%rbx), %rax
-    mul %r13
-    add %rax, %r9
-    adc %rdx, %r10
-    adc  \$0, %r8
-
-    mov 0x10(%rbx), %rax
-    mul %r14
-    add %rax, %r9
-    adc %rdx, %r10
-    adc  \$0, %r8
-
-    mov 0x8(%rbx), %rax
-    mul %r15
-    add %rax, %r9
-    adc %rdx, %r10
-    adc  \$0, %r8
-
-    mov 0x28(%rsi), %rcx
-    mov (%rbx), %rax
-    mul %rcx
-    add %rax, %r9
-    adc %rdx, %r10
-    adc  \$0, %r8
-    add 0x40(%rdi), %r9
-    mov %r9, (%rsi)       # Z9
-    adc  \$0, %r10
-    adc  \$0, %r8
-
-    xor %r9, %r9
-    mov 0x20(%rbx), %rax
-    mul %r13
-    add %rax, %r10
-    adc %rdx, %r8
-    adc  \$0, %r9
-
-    mov 0x18(%rbx), %rax
-    mul %r14
-    add %rax, %r10
-    adc %rdx, %r8
-    adc  \$0, %r9
-
-    mov 0x10(%rbx), %rax
-    mul %r15
-    add %rax, %r10
-    adc %rdx, %r8
-    adc  \$0, %r9
-
-    mov 8(%rbx), %rax
-    mul %rcx
-    add %rax, %r10
-    adc %rdx, %r8
-    adc  \$0, %r9
-
-    mov 0x30(%rsi), %r13
-    mov (%rbx), %rax
-    mul %r13
-    add %rax, %r10
-    adc %rdx, %r8
-    adc  \$0, %r9
-    add 0x48(%rdi), %r10
-    mov %r10, 0x8(%rsi)     # Z1
-    adc  \$0, %r8
-    adc  \$0, %r9
-
-    xor %r10, %r10
-    mov 0x20(%rbx), %rax
-    mul %r14
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-
-    mov 0x18(%rbx), %rax
-    mul %r15
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-
-    mov 0x10(%rbx), %rax
-    mul %rcx
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-
-    mov 8(%rbx), %rax
-    mul %r13
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-
-    mov 0x38(%rsi), %r14
-    mov (%rbx), %rax
-    mul %r14
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-    add 0x50(%rdi), %r8
-    mov %r8, 0x10(%rsi)     # Z2
-    adc  \$0, %r9
-    adc  \$0, %r10
-
-    xor %r8, %r8
-    mov 0x20(%rbx), %rax
-    mul %r15
-    add %rax, %r9
-    adc %rdx, %r10
-    adc  \$0, %r8
-
-    mov 0x18(%rbx), %rax
-    mul %rcx
-    add %rax, %r9
-    adc %rdx, %r10
-    adc  \$0, %r8
-
-    mov 0x10(%rbx), %rax
-    mul %r13
-    add %rax, %r9
-    adc %rdx, %r10
-    adc  \$0, %r8
-
-    mov 8(%rbx), %rax
-    mul %r14
-    add %rax, %r9
-    adc %rdx, %r10
-    adc  \$0, %r8
-    add 0x58(%rdi), %r9
-    mov %r9, 0x18(%rsi)     # Z3
-    adc  \$0, %r10
-    adc  \$0, %r8
-
-    xor %r9, %r9
-    mov 0x20(%rbx), %rax
-    mul %rcx
-    add %rax, %r10
-    adc %rdx, %r8
-    adc  \$0, %r9
-
-    mov 0x18(%rbx), %rax
-    mul %r13
-    add %rax, %r10
-    adc %rdx, %r8
-    adc  \$0, %r9
-
-    mov 0x10(%rbx), %rax
-    mul %r14
-    add %rax, %r10
-    adc %rdx, %r8
-    adc  \$0, %r9
-    add 0x60(%rdi), %r10
-    mov %r10, 0x20(%rsi)    # Z4
-    adc  \$0, %r8
-    adc  \$0, %r9
-
-    xor %r10, %r10
-    mov 0x20(%rbx), %rax
-    mul %r13
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-
-    mov 0x18(%rbx), %rax
-    mul %r14
-    add %rax, %r8
-    adc %rdx, %r9
-    adc  \$0, %r10
-    add 0x68(%rdi), %r8     # Z5
-    mov %r8, 0x28(%rsi)     # Z5
-    adc  \$0, %r9
-    adc  \$0, %r10
-
-    mov 0x20(%rbx), %rax
-    mul %r14
-    add %rax, %r9
-    adc %rdx, %r10
-    add 0x70(%rdi), %r9     # Z6
-    mov %r9, 0x30(%rsi)     # Z6
-    adc  \$0, %r10
-    add 0x78(%rdi), %r10    # Z7
-    mov %r10, 0x38(%rsi)    # Z7
-
-    pop %rbx
-  .cfi_adjust_cfa_offset -8
-    pop %r15
-  .cfi_adjust_cfa_offset -8
-    pop %r14
-  .cfi_adjust_cfa_offset -8
-    pop %r13
-  .cfi_adjust_cfa_offset -8
-    pop %r12
-  .cfi_adjust_cfa_offset -8
-    ret
-  .cfi_endproc
-___
-  return $body;
-}
-
-$code.=&rdc();
+$code.=&sike_mul();
 
 foreach (split("\n",$code)) {
   s/\`([^\`]*)\`/eval($1)/ge;
diff --git a/third_party/sike/asm/fp_generic.c b/third_party/sike/asm/fp_generic.c
index 60e0da1..cdf8755 100644
--- a/third_party/sike/asm/fp_generic.c
+++ b/third_party/sike/asm/fp_generic.c
@@ -13,7 +13,7 @@
 #include "../fpx.h"
 
 // Global constants
-extern const struct params_t p503;
+extern const struct params_t params;
 
 static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c)
 { // Digit multiplication, digit * digit -> 2-digit result
@@ -50,9 +50,9 @@
 }
 
 void sike_fpadd(const felm_t a, const felm_t b, felm_t c)
-{ // Modular addition, c = a+b mod p503.
-  // Inputs: a, b in [0, 2*p503-1]
-  // Output: c in [0, 2*p503-1]
+{ // Modular addition, c = a+b mod p434.
+  // Inputs: a, b in [0, 2*p434-1]
+  // Output: c in [0, 2*p434-1]
     unsigned int i, carry = 0;
     crypto_word_t mask;
 
@@ -62,20 +62,20 @@
 
     carry = 0;
     for (i = 0; i < NWORDS_FIELD; i++) {
-        SUBC(carry, c[i], p503.prime_x2[i], carry, c[i]);
+        SUBC(carry, c[i], params.prime_x2[i], carry, c[i]);
     }
     mask = 0 - (crypto_word_t)carry;
 
     carry = 0;
     for (i = 0; i < NWORDS_FIELD; i++) {
-        ADDC(carry, c[i], p503.prime_x2[i] & mask, carry, c[i]);
+        ADDC(carry, c[i], params.prime_x2[i] & mask, carry, c[i]);
     }
 }
 
 void sike_fpsub(const felm_t a, const felm_t b, felm_t c)
-{ // Modular subtraction, c = a-b mod p503.
-  // Inputs: a, b in [0, 2*p503-1]
-  // Output: c in [0, 2*p503-1]
+{ // Modular subtraction, c = a-b mod p434.
+  // Inputs: a, b in [0, 2*p434-1]
+  // Output: c in [0, 2*p434-1]
     unsigned int i, borrow = 0;
     crypto_word_t mask;
 
@@ -86,7 +86,7 @@
 
     borrow = 0;
     for (i = 0; i < NWORDS_FIELD; i++) {
-        ADDC(borrow, c[i], p503.prime_x2[i] & mask, borrow, c[i]);
+        ADDC(borrow, c[i], params.prime_x2[i] & mask, borrow, c[i]);
     }
 }
 
@@ -124,12 +124,12 @@
     c[2*NWORDS_FIELD-1] = v;
 }
 
-void sike_fprdc(const felm_t ma, felm_t mc)
-{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p503.
-  // mc = ma*R^-1 mod p503x2, where R = 2^512.
-  // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1].
+void sike_fprdc(felm_t ma, felm_t mc)
+{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434.
+  // mc = ma*R^-1 mod p434x2, where R = 2^448.
+  // If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1].
   // ma is assumed to be in Montgomery representation.
-    unsigned int i, j, carry, count = p503_ZERO_WORDS;
+    unsigned int i, j, carry, count = ZERO_WORDS;
     crypto_word_t UV[2], t = 0, u = 0, v = 0;
 
     for (i = 0; i < NWORDS_FIELD; i++) {
@@ -138,8 +138,8 @@
 
     for (i = 0; i < NWORDS_FIELD; i++) {
         for (j = 0; j < i; j++) {
-            if (j < (i-p503_ZERO_WORDS+1)) {
-                MUL(mc[j], p503.prime_p1[i-j], UV+1, UV[0]);
+            if (j < (i-ZERO_WORDS+1)) {
+                MUL(mc[j], params.prime_p1[i-j], UV+1, UV[0]);
                 ADDC(0, UV[0], v, carry, v);
                 ADDC(carry, UV[1], u, carry, u);
                 t += carry;
@@ -160,7 +160,7 @@
         }
         for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) {
             if (j < (NWORDS_FIELD-count)) {
-                MUL(mc[j], p503.prime_p1[i-j], UV+1, UV[0]);
+                MUL(mc[j], params.prime_p1[i-j], UV+1, UV[0]);
                 ADDC(0, UV[0], v, carry, v);
                 ADDC(carry, UV[1], u, carry, u);
                 t += carry;
diff --git a/third_party/sike/fpx.c b/third_party/sike/fpx.c
index 0951418..d85875d 100644
--- a/third_party/sike/fpx.c
+++ b/third_party/sike/fpx.c
@@ -8,7 +8,7 @@
 #include "utils.h"
 #include "fpx.h"
 
-extern const struct params_t p503;
+extern const struct params_t params;
 
 // Multiprecision squaring, c = a^2 mod p.
 static void fpsqr_mont(const felm_t ma, felm_t mc)
@@ -22,101 +22,79 @@
 static void fpinv_chain_mont(felm_t a)
 {
     unsigned int i, j;
-    felm_t t[15], tt;
+    felm_t t[31], tt;
 
     // Precomputed table
     fpsqr_mont(a, tt);
     sike_fpmul_mont(a, tt, t[0]);
-    for (i = 0; i <= 13; i++) sike_fpmul_mont(t[i], tt, t[i+1]);
+    for (i = 0; i <= 29; i++) sike_fpmul_mont(t[i], tt, t[i+1]);
 
     sike_fpcopy(a, tt);
-    for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(a, tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[8], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[6], tt, tt);
-    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[9], tt, tt);
     for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[0], tt, tt);
-    for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(a, tt, tt);
-    for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[6], tt, tt);
-    for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[2], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[8], tt, tt);
-    for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(a, tt, tt);
-    for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[10], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[0], tt, tt);
-    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[10], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[10], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
     sike_fpmul_mont(t[5], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[2], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[6], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
+    for (i = 0; i < 10; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[14], tt, tt);
+    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
     sike_fpmul_mont(t[3], tt, tt);
     for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[5], tt, tt);
-    for (i = 0; i < 12; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[12], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[8], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[6], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[12], tt, tt);
+    sike_fpmul_mont(t[23], tt, tt);
     for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[11], tt, tt);
-    for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[6], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[5], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[14], tt, tt);
-    for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[14], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[5], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[6], tt, tt);
-    for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[8], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(a, tt, tt);
-    for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[4], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[6], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[5], tt, tt);
-    for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[7], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(a, tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[0], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-    sike_fpmul_mont(t[11], tt, tt);
-    for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
     sike_fpmul_mont(t[13], tt, tt);
+    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[24], tt, tt);
+    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[7], tt, tt);
     for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[12], tt, tt);
+    for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[30], tt, tt);
+    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
     sike_fpmul_mont(t[1], tt, tt);
     for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[30], tt, tt);
+    for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[21], tt, tt);
+    for (i = 0; i < 9; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[2], tt, tt);
+    for (i = 0; i < 9; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[19], tt, tt);
+    for (i = 0; i < 9; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[1], tt, tt);
+    for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[24], tt, tt);
+    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[26], tt, tt);
+    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[16], tt, tt);
+    for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
     sike_fpmul_mont(t[10], tt, tt);
-    for (j = 0; j < 49; j++) {
-        for (i = 0; i < 5; i++) fpsqr_mont(tt, tt);
-        sike_fpmul_mont(t[14], tt, tt);
+    for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[6], tt, tt);
+    for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[0], tt, tt);
+    for (i = 0; i < 9; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[20], tt, tt);
+    for (i = 0; i < 8; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[9], tt, tt);
+    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[25], tt, tt);
+    for (i = 0; i < 9; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[30], tt, tt);
+    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[26], tt, tt);
+    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(a, tt, tt);
+    for (i = 0; i < 7; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[28], tt, tt);
+    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[6], tt, tt);
+    for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[10], tt, tt);
+    for (i = 0; i < 9; i++) fpsqr_mont(tt, tt);
+    sike_fpmul_mont(t[22], tt, tt);
+    for (j = 0; j < 35; j++) {
+        for (i = 0; i < 6; i++) fpsqr_mont(tt, tt);
+        sike_fpmul_mont(t[30], tt, tt);
     }
     sike_fpcopy(tt, a);
 }
@@ -190,7 +168,7 @@
     }
 }
 
-// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768
+// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod prime, where R=2^448
 void sike_fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc)
 {
     dfelm_t temp = {0};
@@ -227,7 +205,7 @@
 void sike_fpneg(felm_t a) {
   uint32_t borrow = 0;
   for (size_t i = 0; i < NWORDS_FIELD; i++) {
-    SUBC(borrow, p503.prime_x2[i], a[i], borrow, a[i]);
+    SUBC(borrow, params.prime_x2[i], a[i], borrow, a[i]);
   }
 }
 
@@ -240,7 +218,7 @@
 
   mask = 0 - (crypto_word_t)(a[0] & 1);    // If a is odd compute a+p503
   for (size_t i = 0; i < NWORDS_FIELD; i++) {
-    ADDC(carry, a[i], p503.prime[i] & mask, carry, c[i]);
+    ADDC(carry, a[i], params.prime[i] & mask, carry, c[i]);
   }
 
   // Multiprecision right shift by one.
@@ -256,13 +234,13 @@
   crypto_word_t mask;
 
   for (size_t i = 0; i < NWORDS_FIELD; i++) {
-    SUBC(borrow, a[i], p503.prime[i], borrow, a[i]);
+    SUBC(borrow, a[i], params.prime[i], borrow, a[i]);
   }
   mask = 0 - (crypto_word_t)borrow;
 
   borrow = 0;
   for (size_t i = 0; i < NWORDS_FIELD; i++) {
-    ADDC(borrow, a[i], p503.prime[i] & mask, borrow, a[i]);
+    ADDC(borrow, a[i], params.prime[i] & mask, borrow, a[i]);
   }
 }
 
@@ -283,7 +261,7 @@
     mask = mp_subfast(tt1, tt2, tt1);                  // tt1 = a0*b0 - a1*b1. If tt1 < 0 then mask = 0xFF..F, else if tt1 >= 0 then mask = 0x00..0
 
     for (size_t i = 0; i < NWORDS_FIELD; i++) {
-        t1[i] = p503.prime[i] & mask;
+        t1[i] = params.prime[i] & mask;
     }
 
     sike_fprdc(tt3, c->c1);                             // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
diff --git a/third_party/sike/fpx.h b/third_party/sike/fpx.h
index e787c28..c4c45bd 100644
--- a/third_party/sike/fpx.h
+++ b/third_party/sike/fpx.h
@@ -7,25 +7,26 @@
 extern "C" {
 #endif
 
-// Modular addition, c = a+b mod p503.
+// Modular addition, c = a+b mod p.
 void sike_fpadd(const felm_t a, const felm_t b, felm_t c);
-// Modular subtraction, c = a-b mod p503.
+// Modular subtraction, c = a-b mod p.
 void sike_fpsub(const felm_t a, const felm_t b, felm_t c);
-// Modular division by two, c = a/2 mod p503.
+// Modular division by two, c = a/2 mod p.
 void sike_fpdiv2(const felm_t a, felm_t c);
-// Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1].
+// Modular correction to reduce field element a in [0, 2*p-1] to [0, p-1].
 void sike_fpcorrection(felm_t a);
 // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
 void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c);
-// 503-bit Montgomery reduction, c = a mod p
-void sike_fprdc(const dfelm_t a, felm_t c);
-// Double 2x503-bit multiprecision subtraction, c = c-a-b
+// 434-bit Montgomery reduction, c = a mod p. Buffer 'a' is modified by the
+// time the call returns.
+void sike_fprdc(dfelm_t a, felm_t c);
+// Double 2x434-bit multiprecision subtraction, c = c-a-b
 void sike_mpdblsubx2_asm(const felm_t a, const felm_t b, felm_t c);
 // Multiprecision subtraction, c = a-b
 crypto_word_t sike_mpsubx2_asm(const dfelm_t a, const dfelm_t b, dfelm_t c);
-// 503-bit multiprecision addition, c = a+b
+// 434-bit multiprecision addition, c = a+b
 void sike_mpadd_asm(const felm_t a, const felm_t b, felm_t c);
-// Modular negation, a = -a mod p503.
+// Modular negation, a = -a mod p.
 void sike_fpneg(felm_t a);
 // Copy of a field element, c = a
 void sike_fpcopy(const felm_t a, felm_t c);
@@ -36,11 +37,11 @@
 // Conversion from Montgomery representation to standard representation,
 // c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1].
 void sike_from_mont(const felm_t ma, felm_t c);
-// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768
+// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^448
 void sike_fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc);
-// GF(p503^2) multiplication using Montgomery arithmetic, c = a*b in GF(p503^2)
+// GF(p434^2) multiplication using Montgomery arithmetic, c = a*b in GF(p434^2)
 void sike_fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c);
-// GF(p503^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
+// GF(p434^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
 void sike_fp2inv_mont(f2elm_t a);
 // GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2).
 void sike_fp2sqr_mont(const f2elm_t a, f2elm_t c);
@@ -97,8 +98,8 @@
 // mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2).
 #define sike_to_fp2mont(a, mc)           \
 do {                                     \
-    sike_fpmul_mont(a->c0, p503.mont_R2, mc->c0);   \
-    sike_fpmul_mont(a->c1, p503.mont_R2, mc->c1);   \
+    sike_fpmul_mont(a->c0, params.mont_R2, mc->c0);   \
+    sike_fpmul_mont(a->c1, params.mont_R2, mc->c1);   \
 } while(0)
 
 // Conversion of a GF(p^2) element from Montgomery representation to standard representation,
diff --git a/third_party/sike/isogeny.c b/third_party/sike/isogeny.c
index b8807f3..edb1363 100644
--- a/third_party/sike/isogeny.c
+++ b/third_party/sike/isogeny.c
@@ -189,8 +189,8 @@
   // Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x.
     f2elm_t t0, t1, one = F2ELM_INIT;
 
-    extern const struct params_t p503;
-    sike_fpcopy(p503.mont_one, one->c0);
+    extern const struct params_t params;
+    sike_fpcopy(params.mont_one, one->c0);
     sike_fp2add(xP, xQ, t1);                           // t1 = xP+xQ
     sike_fp2mul_mont(xP, xQ, t0);                      // t0 = xP*xQ
     sike_fp2mul_mont(xR, t1, A);                       // A = xR*t1
diff --git a/third_party/sike/params.c b/third_party/sike/params.c
new file mode 100644
index 0000000..b13f4c8
--- /dev/null
+++ b/third_party/sike/params.c
@@ -0,0 +1,128 @@
+/********************************************************************************************
+* SIDH: an efficient supersingular isogeny cryptography library
+*
+* Abstract: supersingular isogeny parameters and generation of functions for P434
+*********************************************************************************************/
+
+#include "utils.h"
+
+// Parameters for isogeny system "SIKE"
+const struct params_t params = {
+    .prime = {
+        U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF),
+        U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFDC1767AE2FFFFFF),
+        U64_TO_WORDS(0x7BC65C783158AEA3), U64_TO_WORDS(0x6CFC5FD681C52056),
+        U64_TO_WORDS(0x0002341F27177344)
+    },
+    .prime_p1 = {
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xFDC1767AE3000000),
+        U64_TO_WORDS(0x7BC65C783158AEA3), U64_TO_WORDS(0x6CFC5FD681C52056),
+        U64_TO_WORDS(0x0002341F27177344)
+    },
+    .prime_x2 = {
+        U64_TO_WORDS(0xFFFFFFFFFFFFFFFE), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF),
+        U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFB82ECF5C5FFFFFF),
+        U64_TO_WORDS(0xF78CB8F062B15D47), U64_TO_WORDS(0xD9F8BFAD038A40AC),
+        U64_TO_WORDS(0x0004683E4E2EE688)
+    },
+    .A_gen = {
+        U64_TO_WORDS(0x05ADF455C5C345BF), U64_TO_WORDS(0x91935C5CC767AC2B),
+        U64_TO_WORDS(0xAFE4E879951F0257), U64_TO_WORDS(0x70E792DC89FA27B1),
+        U64_TO_WORDS(0xF797F526BB48C8CD), U64_TO_WORDS(0x2181DB6131AF621F),
+        U64_TO_WORDS(0x00000A1C08B1ECC4), // XPA0
+        U64_TO_WORDS(0x74840EB87CDA7788), U64_TO_WORDS(0x2971AA0ECF9F9D0B),
+        U64_TO_WORDS(0xCB5732BDF41715D5), U64_TO_WORDS(0x8CD8E51F7AACFFAA),
+        U64_TO_WORDS(0xA7F424730D7E419F), U64_TO_WORDS(0xD671EB919A179E8C),
+        U64_TO_WORDS(0x0000FFA26C5A924A), // XPA1
+        U64_TO_WORDS(0xFEC6E64588B7273B), U64_TO_WORDS(0xD2A626D74CBBF1C6),
+        U64_TO_WORDS(0xF8F58F07A78098C7), U64_TO_WORDS(0xE23941F470841B03),
+        U64_TO_WORDS(0x1B63EDA2045538DD), U64_TO_WORDS(0x735CFEB0FFD49215),
+        U64_TO_WORDS(0x0001C4CB77542876), // XQA0
+        U64_TO_WORDS(0xADB0F733C17FFDD6), U64_TO_WORDS(0x6AFFBD037DA0A050),
+        U64_TO_WORDS(0x680EC43DB144E02F), U64_TO_WORDS(0x1E2E5D5FF524E374),
+        U64_TO_WORDS(0xE2DDA115260E2995), U64_TO_WORDS(0xA6E4B552E2EDE508),
+        U64_TO_WORDS(0x00018ECCDDF4B53E), // XQA1
+        U64_TO_WORDS(0x01BA4DB518CD6C7D), U64_TO_WORDS(0x2CB0251FE3CC0611),
+        U64_TO_WORDS(0x259B0C6949A9121B), U64_TO_WORDS(0x60E17AC16D2F82AD),
+        U64_TO_WORDS(0x3AA41F1CE175D92D), U64_TO_WORDS(0x413FBE6A9B9BC4F3),
+        U64_TO_WORDS(0x00022A81D8D55643), // XRA0
+        U64_TO_WORDS(0xB8ADBC70FC82E54A), U64_TO_WORDS(0xEF9CDDB0D5FADDED),
+        U64_TO_WORDS(0x5820C734C80096A0), U64_TO_WORDS(0x7799994BAA96E0E4),
+        U64_TO_WORDS(0x044961599E379AF8), U64_TO_WORDS(0xDB2B94FBF09F27E2),
+        U64_TO_WORDS(0x0000B87FC716C0C6)  // XRA1
+    },
+    .B_gen = {
+        U64_TO_WORDS(0x6E5497556EDD48A3), U64_TO_WORDS(0x2A61B501546F1C05),
+        U64_TO_WORDS(0xEB919446D049887D), U64_TO_WORDS(0x5864A4A69D450C4F),
+        U64_TO_WORDS(0xB883F276A6490D2B), U64_TO_WORDS(0x22CC287022D5F5B9),
+        U64_TO_WORDS(0x0001BED4772E551F), // XPB0
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), // XPB1
+        U64_TO_WORDS(0xFAE2A3F93D8B6B8E), U64_TO_WORDS(0x494871F51700FE1C),
+        U64_TO_WORDS(0xEF1A94228413C27C), U64_TO_WORDS(0x498FF4A4AF60BD62),
+        U64_TO_WORDS(0xB00AD2A708267E8A), U64_TO_WORDS(0xF4328294E017837F),
+        U64_TO_WORDS(0x000034080181D8AE), // XQB0
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), // XQB1
+        U64_TO_WORDS(0x283B34FAFEFDC8E4), U64_TO_WORDS(0x9208F44977C3E647),
+        U64_TO_WORDS(0x7DEAE962816F4E9A), U64_TO_WORDS(0x68A2BA8AA262EC9D),
+        U64_TO_WORDS(0x8176F112EA43F45B), U64_TO_WORDS(0x02106D022634F504),
+        U64_TO_WORDS(0x00007E8A50F02E37), // XRB0
+        U64_TO_WORDS(0xB378B7C1DA22CCB1), U64_TO_WORDS(0x6D089C99AD1D9230),
+        U64_TO_WORDS(0xEBE15711813E2369), U64_TO_WORDS(0x2B35A68239D48A53),
+        U64_TO_WORDS(0x445F6FD138407C93), U64_TO_WORDS(0xBEF93B29A3F6B54B),
+        U64_TO_WORDS(0x000173FA910377D3)  // XRB1
+    },
+    .mont_R2 = {
+        U64_TO_WORDS(0x28E55B65DCD69B30), U64_TO_WORDS(0xACEC7367768798C2),
+        U64_TO_WORDS(0xAB27973F8311688D), U64_TO_WORDS(0x175CC6AF8D6C7C0B),
+        U64_TO_WORDS(0xABCD92BF2DDE347E), U64_TO_WORDS(0x69E16A61C7686D9A),
+        U64_TO_WORDS(0x000025A89BCDD12A)
+    },
+    .mont_one = {
+        U64_TO_WORDS(0x000000000000742C), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xB90FF404FC000000),
+        U64_TO_WORDS(0xD801A4FB559FACD4), U64_TO_WORDS(0xE93254545F77410C),
+        U64_TO_WORDS(0x0000ECEEA7BD2EDA)
+    },
+    .mont_six = {
+        U64_TO_WORDS(0x000000000002B90A), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x5ADCCB2822000000),
+        U64_TO_WORDS(0x187D24F39F0CAFB4), U64_TO_WORDS(0x9D353A4D394145A0),
+        U64_TO_WORDS(0x00012559A0403298)
+    },
+    .A_strat = {
+        0x30, 0x1C, 0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+        0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04,
+        0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01,
+        0x02, 0x01, 0x01, 0x0D, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02,
+        0x01, 0x01, 0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x04,
+        0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01,
+        0x15, 0x0C, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
+        0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x03, 0x02, 0x01,
+        0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x09, 0x05, 0x03,
+        0x02, 0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x04,
+        0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01
+    },
+    .B_strat = {
+        0x42, 0x21, 0x11, 0x09, 0x05, 0x03, 0x02, 0x01, 0x01, 0x01,
+        0x01, 0x02, 0x01, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x01,
+        0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02,
+        0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x10,
+        0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04,
+        0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01,
+        0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+        0x01, 0x20, 0x10, 0x08, 0x04, 0x03, 0x01, 0x01, 0x01, 0x01,
+        0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
+        0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02,
+        0x01, 0x01, 0x02, 0x01, 0x01, 0x10, 0x08, 0x04, 0x02, 0x01,
+        0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+        0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04,
+        0x02, 0x01, 0x01, 0x02, 0x01, 0x01
+    }
+};
diff --git a/third_party/sike/sike.c b/third_party/sike/sike.c
index 689baa8..f6a19be 100644
--- a/third_party/sike/sike.c
+++ b/third_party/sike/sike.c
@@ -11,65 +11,25 @@
 #include <openssl/base.h>
 #include <openssl/rand.h>
 #include <openssl/mem.h>
-#include <openssl/hmac.h>
 #include <openssl/sha.h>
 
 #include "utils.h"
 #include "isogeny.h"
 #include "fpx.h"
 
-extern const struct params_t p503;
+extern const struct params_t params;
 
-// Domain separation parameters for HMAC
-static const uint8_t G[2] = {0,0};
-static const uint8_t H[2] = {1,0};
-static const uint8_t F[2] = {2,0};
-
-// SIDHp503_JINV_BYTESZ is a number of bytes used for encoding j-invariant.
-#define SIDHp503_JINV_BYTESZ    126U
-// SIDHp503_PRV_A_BITSZ is a number of bits of SIDH private key (2-isogeny)
-#define SIDHp503_PRV_A_BITSZ    250U
-// SIDHp503_PRV_A_BITSZ is a number of bits of SIDH private key (3-isogeny)
-#define SIDHp503_PRV_B_BITSZ    253U
+// SIDH_JINV_BYTESZ is a number of bytes used for encoding j-invariant.
+#define SIDH_JINV_BYTESZ    110U
+// SIDH_PRV_A_BITSZ is a number of bits of SIDH private key (2-isogeny)
+#define SIDH_PRV_A_BITSZ    216U
+// SIDH_PRV_B_BITSZ is a number of bits of SIDH private key (3-isogeny)
+#define SIDH_PRV_B_BITSZ    217U
 // MAX_INT_POINTS_ALICE is a number of points used in 2-isogeny tree computation
 #define MAX_INT_POINTS_ALICE    7U
 // MAX_INT_POINTS_ALICE is a number of points used in 3-isogeny tree computation
 #define MAX_INT_POINTS_BOB      8U
 
-// Produces HMAC-SHA256 of data |S| mac'ed with the key |key|. Result is stored in |out|
-// which must have size of at least |outsz| bytes and must be not bigger than
-// SHA256_DIGEST_LENGTH. The output of a HMAC may be truncated.
-// The |key| buffer is reused by the hmac_sum and hence, it's size must be equal
-// to SHA256_CBLOCK. The HMAC key provided in |key| buffer must be smaller or equal
-// to SHA256_DIGHEST_LENTH. |key| can overlap |out|.
-static void hmac_sum(
-    uint8_t *out, size_t outsz, const uint8_t S[2], uint8_t key[SHA256_CBLOCK]) {
-    for(size_t i=0; i<SHA256_DIGEST_LENGTH; i++) {
-        key[i] = key[i] ^ 0x36;
-    }
-    // set rest of the buffer to ipad = 0x36
-    memset(&key[SHA256_DIGEST_LENGTH], 0x36, SHA256_CBLOCK - SHA256_DIGEST_LENGTH);
-
-    SHA256_CTX ctx;
-    SHA256_Init(&ctx);
-    SHA256_Update(&ctx, key, SHA256_CBLOCK);
-    SHA256_Update(&ctx, S, 2);
-    uint8_t digest[SHA256_DIGEST_LENGTH];
-    SHA256_Final(digest, &ctx);
-
-    // XOR key with an opad = 0x5C
-    for(size_t i=0; i<SHA256_CBLOCK; i++) {
-        key[i] = key[i] ^ 0x36 ^ 0x5C;
-    }
-
-    SHA256_Init(&ctx);
-    SHA256_Update(&ctx, key, SHA256_CBLOCK);
-    SHA256_Update(&ctx, digest, SHA256_DIGEST_LENGTH);
-    SHA256_Final(digest, &ctx);
-    assert(outsz <= sizeof(digest));
-    memcpy(out, digest, outsz);
-}
-
 // Swap points.
 // If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P
 #if !defined(OPENSSL_X86_64) || defined(OPENSSL_NO_ASM)
@@ -104,7 +64,7 @@
 #endif
 }
 
-static void LADDER3PT(
+static void ladder3Pt(
     const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const uint8_t* m,
     int is_A, point_proj_t R, const f2elm_t A) {
     point_proj_t R0 = POINT_PROJ_INIT, R2 = POINT_PROJ_INIT;
@@ -112,10 +72,10 @@
     crypto_word_t mask;
     int bit, swap, prevbit = 0;
 
-    const size_t nbits = is_A?SIDHp503_PRV_A_BITSZ:SIDHp503_PRV_B_BITSZ;
+    const size_t nbits = is_A?SIDH_PRV_A_BITSZ:SIDH_PRV_B_BITSZ;
 
     // Initializing constant
-    sike_fpcopy(p503.mont_one, A24[0].c0);
+    sike_fpcopy(params.mont_one, A24[0].c0);
     sike_fp2add(A24, A24, A24);
     sike_fp2add(A, A24, A24);
     sike_fp2div2(A24, A24);
@@ -123,11 +83,11 @@
 
     // Initializing points
     sike_fp2copy(xQ, R0->X);
-    sike_fpcopy(p503.mont_one, R0->Z[0].c0);
+    sike_fpcopy(params.mont_one, R0->Z[0].c0);
     sike_fp2copy(xPQ, R2->X);
-    sike_fpcopy(p503.mont_one, R2->Z[0].c0);
+    sike_fpcopy(params.mont_one, R2->Z[0].c0);
     sike_fp2copy(xP, R->X);
-    sike_fpcopy(p503.mont_one, R->Z[0].c0);
+    sike_fpcopy(params.mont_one, R->Z[0].c0);
     memset(R->Z->c1, 0, sizeof(R->Z->c1));
 
     // Main loop
@@ -141,6 +101,9 @@
         xDBLADD(R0, R2, R->X, A24);
         sike_fp2mul_mont(R2->X, R->Z, R2->X);
     }
+
+    mask = 0 - (crypto_word_t)prevbit;
+    sike_fp2cswap(R, R2, mask);
 }
 
 // Initialization of basis points
@@ -148,9 +111,9 @@
     sike_fpcopy(gen,                  XP->c0);
     sike_fpcopy(gen +   NWORDS_FIELD, XP->c1);
     sike_fpcopy(gen + 2*NWORDS_FIELD, XQ->c0);
-    memset(XQ->c1, 0, sizeof(XQ->c1));
-    sike_fpcopy(gen + 3*NWORDS_FIELD, XR->c0);
-    sike_fpcopy(gen + 4*NWORDS_FIELD, XR->c1);
+    sike_fpcopy(gen + 3*NWORDS_FIELD, XQ->c1);
+    sike_fpcopy(gen + 4*NWORDS_FIELD, XR->c0);
+    sike_fpcopy(gen + 5*NWORDS_FIELD, XR->c1);
 }
 
 // Conversion of GF(p^2) element from Montgomery to standard representation.
@@ -195,18 +158,21 @@
     unsigned int m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0;
 
     // Initialize basis points
-    sike_init_basis(p503.A_gen, XPA, XQA, XRA);
-    sike_init_basis(p503.B_gen, phiP->X, phiQ->X, phiR->X);
-    sike_fpcopy(p503.mont_one, (phiP->Z)->c0);
-    sike_fpcopy(p503.mont_one, (phiQ->Z)->c0);
-    sike_fpcopy(p503.mont_one, (phiR->Z)->c0);
+    sike_init_basis(params.A_gen, XPA, XQA, XRA);
+    sike_init_basis(params.B_gen, phiP->X, phiQ->X, phiR->X);
+    sike_fpcopy(params.mont_one, (phiP->Z)->c0);
+    sike_fpcopy(params.mont_one, (phiQ->Z)->c0);
+    sike_fpcopy(params.mont_one, (phiR->Z)->c0);
 
-    // Initialize constants
-    sike_fpcopy(p503.mont_one, A24plus->c0);
+    // Initialize constants: A24plus = A+2C, C24 = 4C, where A=6, C=1
+    sike_fpcopy(params.mont_one, A24plus->c0);
+    sike_fp2add(A24plus, A24plus, A24plus);
     sike_fp2add(A24plus, A24plus, C24);
+    sike_fp2add(A24plus, C24, A);
+    sike_fp2add(C24, C24, A24plus);
 
     // Retrieve kernel point
-    LADDER3PT(XPA, XQA, XRA, skA, 1, R, A);
+    ladder3Pt(XPA, XQA, XRA, skA, 1, R, A);
 
     // Traverse tree
     index = 0;
@@ -215,7 +181,7 @@
             sike_fp2copy(R->X, pts[npts]->X);
             sike_fp2copy(R->Z, pts[npts]->Z);
             pts_index[npts++] = index;
-            m = p503.A_strat[ii++];
+            m = params.A_strat[ii++];
             xDBLe(R, R, A24plus, C24, (2*m));
             index += m;
         }
@@ -246,8 +212,8 @@
 
     // Format public key
     sike_fp2_encode(phiP->X, pkA);
-    sike_fp2_encode(phiQ->X, pkA + SIDHp503_JINV_BYTESZ);
-    sike_fp2_encode(phiR->X, pkA + 2*SIDHp503_JINV_BYTESZ);
+    sike_fp2_encode(phiQ->X, pkA + SIDH_JINV_BYTESZ);
+    sike_fp2_encode(phiR->X, pkA + 2*SIDH_JINV_BYTESZ);
 }
 
 // Bob's ephemeral key-pair generation
@@ -267,20 +233,21 @@
     unsigned int m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0;
 
     // Initialize basis points
-    sike_init_basis(p503.B_gen, XPB, XQB, XRB);
-    sike_init_basis(p503.A_gen, phiP->X, phiQ->X, phiR->X);
-    sike_fpcopy(p503.mont_one, (phiP->Z)->c0);
-    sike_fpcopy(p503.mont_one, (phiQ->Z)->c0);
-    sike_fpcopy(p503.mont_one, (phiR->Z)->c0);
+    sike_init_basis(params.B_gen, XPB, XQB, XRB);
+    sike_init_basis(params.A_gen, phiP->X, phiQ->X, phiR->X);
+    sike_fpcopy(params.mont_one, (phiP->Z)->c0);
+    sike_fpcopy(params.mont_one, (phiQ->Z)->c0);
+    sike_fpcopy(params.mont_one, (phiR->Z)->c0);
 
-    // Initialize constants
-    sike_fpcopy(p503.mont_one, A24plus->c0);
+    // Initialize constants: A24minus = A-2C, A24plus = A+2C, where A=6, C=1
+    sike_fpcopy(params.mont_one, A24plus->c0);
     sike_fp2add(A24plus, A24plus, A24plus);
-    sike_fp2copy(A24plus, A24minus);
-    sike_fp2neg(A24minus);
+    sike_fp2add(A24plus, A24plus, A24minus);
+    sike_fp2add(A24plus, A24minus, A);
+    sike_fp2add(A24minus, A24minus, A24plus);
 
     // Retrieve kernel point
-    LADDER3PT(XPB, XQB, XRB, skB, 0, R, A);
+    ladder3Pt(XPB, XQB, XRB, skB, 0, R, A);
 
     // Traverse tree
     index = 0;
@@ -289,7 +256,7 @@
             sike_fp2copy(R->X, pts[npts]->X);
             sike_fp2copy(R->Z, pts[npts]->Z);
             pts_index[npts++] = index;
-            m = p503.B_strat[ii++];
+            m = params.B_strat[ii++];
             xTPLe(R, R, A24minus, A24plus, m);
             index += m;
         }
@@ -320,8 +287,8 @@
 
     // Format public key
     sike_fp2_encode(phiP->X, pkB);
-    sike_fp2_encode(phiQ->X, pkB + SIDHp503_JINV_BYTESZ);
-    sike_fp2_encode(phiR->X, pkB + 2*SIDHp503_JINV_BYTESZ);
+    sike_fp2_encode(phiQ->X, pkB + SIDH_JINV_BYTESZ);
+    sike_fp2_encode(phiR->X, pkB + 2*SIDH_JINV_BYTESZ);
 }
 
 // Alice's ephemeral shared secret computation
@@ -340,17 +307,17 @@
 
     // Initialize images of Bob's basis
     fp2_decode(pkB, PKB[0]);
-    fp2_decode(pkB + SIDHp503_JINV_BYTESZ, PKB[1]);
-    fp2_decode(pkB + 2*SIDHp503_JINV_BYTESZ, PKB[2]);
+    fp2_decode(pkB + SIDH_JINV_BYTESZ, PKB[1]);
+    fp2_decode(pkB + 2*SIDH_JINV_BYTESZ, PKB[2]);
 
     // Initialize constants
-    get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A?
-    sike_fpadd(p503.mont_one, p503.mont_one, C24->c0);
+    get_A(PKB[0], PKB[1], PKB[2], A);
+    sike_fpadd(params.mont_one, params.mont_one, C24->c0);
     sike_fp2add(A, C24, A24plus);
     sike_fpadd(C24->c0, C24->c0, C24->c0);
 
     // Retrieve kernel point
-    LADDER3PT(PKB[0], PKB[1], PKB[2], skA, 1, R, A);
+    ladder3Pt(PKB[0], PKB[1], PKB[2], skA, 1, R, A);
 
     // Traverse tree
     index = 0;
@@ -359,7 +326,7 @@
             sike_fp2copy(R->X, pts[npts]->X);
             sike_fp2copy(R->Z, pts[npts]->Z);
             pts_index[npts++] = index;
-            m = p503.A_strat[ii++];
+            m = params.A_strat[ii++];
             xDBLe(R, R, A24plus, C24, (2*m));
             index += m;
         }
@@ -376,9 +343,9 @@
     }
 
     get_4_isog(R, A24plus, C24, coeff);
-    sike_fp2div2(C24, C24);
+    sike_fp2add(A24plus, A24plus, A24plus);
     sike_fp2sub(A24plus, C24, A24plus);
-    sike_fp2div2(C24, C24);
+    sike_fp2add(A24plus, A24plus, A24plus);
     j_inv(A24plus, C24, jinv);
     sike_fp2_encode(jinv, ssA);
 }
@@ -399,17 +366,17 @@
 
     // Initialize images of Alice's basis
     fp2_decode(pkA, PKB[0]);
-    fp2_decode(pkA + SIDHp503_JINV_BYTESZ, PKB[1]);
-    fp2_decode(pkA + 2*SIDHp503_JINV_BYTESZ, PKB[2]);
+    fp2_decode(pkA + SIDH_JINV_BYTESZ, PKB[1]);
+    fp2_decode(pkA + 2*SIDH_JINV_BYTESZ, PKB[2]);
 
     // Initialize constants
     get_A(PKB[0], PKB[1], PKB[2], A);
-    sike_fpadd(p503.mont_one, p503.mont_one, A24minus->c0);
+    sike_fpadd(params.mont_one, params.mont_one, A24minus->c0);
     sike_fp2add(A, A24minus, A24plus);
     sike_fp2sub(A, A24minus, A24minus);
 
     // Retrieve kernel point
-    LADDER3PT(PKB[0], PKB[1], PKB[2], skB, 0, R, A);
+    ladder3Pt(PKB[0], PKB[1], PKB[2], skB, 0, R, A);
 
     // Traverse tree
     index = 0;
@@ -418,7 +385,7 @@
             sike_fp2copy(R->X, pts[npts]->X);
             sike_fp2copy(R->Z, pts[npts]->Z);
             pts_index[npts++] = index;
-            m = p503.B_strat[ii++];
+            m = params.B_strat[ii++];
             xTPLe(R, R, A24minus, A24plus, m);
             index += m;
         }
@@ -442,17 +409,17 @@
     sike_fp2_encode(jinv, ssB);
 }
 
-int SIKE_keypair(uint8_t out_priv[SIKEp503_PRV_BYTESZ],
-                 uint8_t out_pub[SIKEp503_PUB_BYTESZ]) {
+int SIKE_keypair(uint8_t out_priv[SIKE_PRV_BYTESZ],
+                 uint8_t out_pub[SIKE_PUB_BYTESZ]) {
   int ret = 0;
 
   // Calculate private key for Alice. Needs to be in range [0, 2^0xFA - 1] and <
   // 253 bits
   BIGNUM *bn_sidh_prv = BN_new();
   if (!bn_sidh_prv ||
-      !BN_rand(bn_sidh_prv, SIDHp503_PRV_B_BITSZ, BN_RAND_TOP_ONE,
+      !BN_rand(bn_sidh_prv, SIDH_PRV_B_BITSZ, BN_RAND_TOP_ONE,
                BN_RAND_BOTTOM_ANY) ||
-      !BN_bn2le_padded(out_priv, BITS_TO_BYTES(SIDHp503_PRV_B_BITSZ),
+      !BN_bn2le_padded(out_priv, BITS_TO_BYTES(SIDH_PRV_B_BITSZ),
                        bn_sidh_prv)) {
     goto end;
   }
@@ -465,70 +432,67 @@
   return ret;
 }
 
-void SIKE_encaps(uint8_t out_shared_key[SIKEp503_SS_BYTESZ],
-                 uint8_t out_ciphertext[SIKEp503_CT_BYTESZ],
-                 const uint8_t pub_key[SIKEp503_PUB_BYTESZ]) {
+void SIKE_encaps(uint8_t out_shared_key[SIKE_SS_BYTESZ],
+                 uint8_t out_ciphertext[SIKE_CT_BYTESZ],
+                 const uint8_t pub_key[SIKE_PUB_BYTESZ]) {
   // Secret buffer is reused by the function to store some ephemeral
   // secret data. It's size must be maximum of SHA256_CBLOCK,
-  // SIKEp503_MSG_BYTESZ and SIDHp503_PRV_A_BITSZ in bytes.
+  // SIKE_MSG_BYTESZ and SIDH_PRV_A_BITSZ in bytes.
   uint8_t secret[SHA256_CBLOCK];
-  uint8_t j[SIDHp503_JINV_BYTESZ];
-  uint8_t temp[SIKEp503_MSG_BYTESZ + SIKEp503_CT_BYTESZ];
+  uint8_t j[SIDH_JINV_BYTESZ];
+  uint8_t temp[SIKE_MSG_BYTESZ + SIKE_CT_BYTESZ];
   SHA256_CTX ctx;
 
   // Generate secret key for A
-  // secret key A = HMAC({0,1}^n || pub_key), G) mod SIDHp503_PRV_A_BITSZ
-  RAND_bytes(temp, SIKEp503_MSG_BYTESZ);
+  // secret key A = SHA256({0,1}^n || pub_key) mod SIDH_PRV_A_BITSZ
+  RAND_bytes(temp, SIKE_MSG_BYTESZ);
 
   SHA256_Init(&ctx);
-  SHA256_Update(&ctx, temp, SIKEp503_MSG_BYTESZ);
-  SHA256_Update(&ctx, pub_key, SIKEp503_PUB_BYTESZ);
+  SHA256_Update(&ctx, temp, SIKE_MSG_BYTESZ);
+  SHA256_Update(&ctx, pub_key, SIKE_PUB_BYTESZ);
   SHA256_Final(secret, &ctx);
-  hmac_sum(secret, BITS_TO_BYTES(SIDHp503_PRV_A_BITSZ), G, secret);
-  secret[BITS_TO_BYTES(SIDHp503_PRV_A_BITSZ) - 1] &=
-      (1 << (SIDHp503_PRV_A_BITSZ % 8)) - 1;
 
   // Generate public key for A - first part of the ciphertext
   gen_iso_A(secret, out_ciphertext);
 
   // Generate c1:
-  //  h = HMAC(j-invariant(secret key A, public key B), F)
+  //  h = SHA256(j-invariant)
   // c1 = h ^ m
   ex_iso_A(secret, pub_key, j);
   SHA256_Init(&ctx);
   SHA256_Update(&ctx, j, sizeof(j));
   SHA256_Final(secret, &ctx);
-  hmac_sum(secret, SIKEp503_MSG_BYTESZ, F, secret);
 
   // c1 = h ^ m
-  uint8_t *c1 = &out_ciphertext[SIKEp503_PUB_BYTESZ];
-  for (size_t i = 0; i < SIKEp503_MSG_BYTESZ; i++) {
+  uint8_t *c1 = &out_ciphertext[SIKE_PUB_BYTESZ];
+  for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) {
     c1[i] = temp[i] ^ secret[i];
   }
 
   SHA256_Init(&ctx);
-  SHA256_Update(&ctx, temp, SIKEp503_MSG_BYTESZ);
-  SHA256_Update(&ctx, out_ciphertext, SIKEp503_CT_BYTESZ);
+  SHA256_Update(&ctx, temp, SIKE_MSG_BYTESZ);
+  SHA256_Update(&ctx, out_ciphertext, SIKE_CT_BYTESZ);
   SHA256_Final(secret, &ctx);
-  // Generate shared secret out_shared_key = HMAC(m||out_ciphertext, F)
-  hmac_sum(out_shared_key, SIKEp503_SS_BYTESZ, H, secret);
+  // Generate shared secret out_shared_key = SHA256(m||out_ciphertext)
+  memcpy(out_shared_key, secret, SIKE_SS_BYTESZ);
 }
 
-void SIKE_decaps(uint8_t out_shared_key[SIKEp503_SS_BYTESZ],
-                 const uint8_t ciphertext[SIKEp503_CT_BYTESZ],
-                 const uint8_t pub_key[SIKEp503_PUB_BYTESZ],
-                 const uint8_t priv_key[SIKEp503_PRV_BYTESZ]) {
+void SIKE_decaps(uint8_t out_shared_key[SIKE_SS_BYTESZ],
+                 const uint8_t ciphertext[SIKE_CT_BYTESZ],
+                 const uint8_t pub_key[SIKE_PUB_BYTESZ],
+                 const uint8_t priv_key[SIKE_PRV_BYTESZ]) {
   // Secret buffer is reused by the function to store some ephemeral
   // secret data. It's size must be maximum of SHA256_CBLOCK,
-  // SIKEp503_MSG_BYTESZ and SIDHp503_PRV_A_BITSZ in bytes.
+  // SIKE_MSG_BYTESZ and SIDH_PRV_A_BITSZ in bytes.
   uint8_t secret[SHA256_CBLOCK];
-  uint8_t j[SIDHp503_JINV_BYTESZ];
-  uint8_t c0[SIKEp503_PUB_BYTESZ];
-  uint8_t temp[SIKEp503_MSG_BYTESZ];
-  uint8_t shared_nok[SIKEp503_MSG_BYTESZ];
+  uint8_t j[SIDH_JINV_BYTESZ];
+  uint8_t c0[SIKE_PUB_BYTESZ];
+  uint8_t temp[SIKE_MSG_BYTESZ];
+  uint8_t shared_nok[SIKE_MSG_BYTESZ];
   SHA256_CTX ctx;
 
-  RAND_bytes(shared_nok, SIKEp503_MSG_BYTESZ);
+  // This is OK as we are only using ephemeral keys in BoringSSL
+  RAND_bytes(shared_nok, SIKE_MSG_BYTESZ);
 
   // Recover m
   // Let ciphertext = c0 || c1 - both have fixed sizes
@@ -538,34 +502,30 @@
   SHA256_Init(&ctx);
   SHA256_Update(&ctx, j, sizeof(j));
   SHA256_Final(secret, &ctx);
-  hmac_sum(secret, SIKEp503_MSG_BYTESZ, F, secret);
 
   const uint8_t *c1 = &ciphertext[sizeof(c0)];
-  for (size_t i = 0; i < SIKEp503_MSG_BYTESZ; i++) {
+  for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) {
     temp[i] = c1[i] ^ secret[i];
   }
 
   SHA256_Init(&ctx);
-  SHA256_Update(&ctx, temp, SIKEp503_MSG_BYTESZ);
-  SHA256_Update(&ctx, pub_key, SIKEp503_PUB_BYTESZ);
+  SHA256_Update(&ctx, temp, SIKE_MSG_BYTESZ);
+  SHA256_Update(&ctx, pub_key, SIKE_PUB_BYTESZ);
   SHA256_Final(secret, &ctx);
-  hmac_sum(secret, BITS_TO_BYTES(SIDHp503_PRV_A_BITSZ), G, secret);
-
-  // Recover secret key A = G(m||pub_key) mod
-  secret[BITS_TO_BYTES(SIDHp503_PRV_A_BITSZ) - 1] &=
-      (1 << (SIDHp503_PRV_A_BITSZ % 8)) - 1;
 
   // Recover c0 = public key A
   gen_iso_A(secret, c0);
   crypto_word_t ok = constant_time_is_zero_w(
-      CRYPTO_memcmp(c0, ciphertext, SIKEp503_PUB_BYTESZ));
-  for (size_t i = 0; i < SIKEp503_MSG_BYTESZ; i++) {
+      CRYPTO_memcmp(c0, ciphertext, SIKE_PUB_BYTESZ));
+  for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) {
     temp[i] = constant_time_select_8(ok, temp[i], shared_nok[i]);
   }
 
   SHA256_Init(&ctx);
-  SHA256_Update(&ctx, temp, SIKEp503_MSG_BYTESZ);
-  SHA256_Update(&ctx, ciphertext, SIKEp503_CT_BYTESZ);
+  SHA256_Update(&ctx, temp, SIKE_MSG_BYTESZ);
+  SHA256_Update(&ctx, ciphertext, SIKE_CT_BYTESZ);
   SHA256_Final(secret, &ctx);
-  hmac_sum(out_shared_key, SIKEp503_SS_BYTESZ, H, secret);
+
+  // Generate shared secret out_shared_key = SHA256(m||ciphertext)
+  memcpy(out_shared_key, secret, SIKE_SS_BYTESZ);
 }
diff --git a/third_party/sike/sike.h b/third_party/sike/sike.h
index 09093cd..5819ebf 100644
--- a/third_party/sike/sike.h
+++ b/third_party/sike/sike.h
@@ -10,14 +10,14 @@
 #include <stdint.h>
 #include <openssl/base.h>
 
-#ifdef __cplusplus
+#if defined(__cplusplus)
 extern "C" {
 #endif
 
-/* SIKEp503
+/* SIKE
  *
  * SIKE is a isogeny based post-quantum key encapsulation mechanism. Description of the
- * algorithm is provided in [SIKE]. This implementation uses 503-bit field size. The code
+ * algorithm is provided in [SIKE]. This implementation uses 434-bit field size. The code
  * is based on "Additional_Implementations" from PQC NIST submission package which can
  * be found here:
  * https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/SIKE.zip
@@ -25,39 +25,39 @@
  * [SIKE] https://sike.org/files/SIDH-spec.pdf
  */
 
-// SIKEp503_PUB_BYTESZ is the number of bytes in a public key.
-#define SIKEp503_PUB_BYTESZ 378
-// SIKEp503_PRV_BYTESZ is the number of bytes in a private key.
-#define SIKEp503_PRV_BYTESZ 32
-// SIKEp503_SS_BYTESZ is the number of bytes in a shared key.
-#define SIKEp503_SS_BYTESZ  16
-// SIKEp503_MSG_BYTESZ is the number of bytes in a random bit string concatenated
+// SIKE_PUB_BYTESZ is the number of bytes in a public key.
+#define SIKE_PUB_BYTESZ 330
+// SIKE_PRV_BYTESZ is the number of bytes in a private key.
+#define SIKE_PRV_BYTESZ 28
+// SIKE_SS_BYTESZ is the number of bytes in a shared key.
+#define SIKE_SS_BYTESZ  16
+// SIKE_MSG_BYTESZ is the number of bytes in a random bit string concatenated
 // with the public key (see 1.4 of SIKE).
-#define SIKEp503_MSG_BYTESZ 24
-// SIKEp503_SS_BYTESZ is the number of bytes in a ciphertext.
-#define SIKEp503_CT_BYTESZ  (SIKEp503_PUB_BYTESZ + SIKEp503_MSG_BYTESZ)
+#define SIKE_MSG_BYTESZ 16
+// SIKE_CT_BYTESZ is the number of bytes in a ciphertext.
+#define SIKE_CT_BYTESZ  (SIKE_PUB_BYTESZ + SIKE_MSG_BYTESZ)
 
 // SIKE_keypair outputs a public and secret key. Internally it uses BN_rand() as
 // an entropy source. In case of success function returns 1, otherwise 0.
 OPENSSL_EXPORT int SIKE_keypair(
-    uint8_t out_priv[SIKEp503_PRV_BYTESZ],
-    uint8_t out_pub[SIKEp503_PUB_BYTESZ]);
+    uint8_t out_priv[SIKE_PRV_BYTESZ],
+    uint8_t out_pub[SIKE_PUB_BYTESZ]);
 
 // SIKE_encaps generates and encrypts a random session key, writing those values to
 // |out_shared_key| and |out_ciphertext|, respectively.
 OPENSSL_EXPORT void SIKE_encaps(
-    uint8_t out_shared_key[SIKEp503_SS_BYTESZ],
-    uint8_t out_ciphertext[SIKEp503_CT_BYTESZ],
-    const uint8_t pub_key[SIKEp503_PUB_BYTESZ]);
+    uint8_t out_shared_key[SIKE_SS_BYTESZ],
+    uint8_t out_ciphertext[SIKE_CT_BYTESZ],
+    const uint8_t pub_key[SIKE_PUB_BYTESZ]);
 
 // SIKE_decaps outputs a random session key, writing it to |out_shared_key|.
 OPENSSL_EXPORT void SIKE_decaps(
-    uint8_t out_shared_key[SIKEp503_SS_BYTESZ],
-    const uint8_t ciphertext[SIKEp503_CT_BYTESZ],
-    const uint8_t pub_key[SIKEp503_PUB_BYTESZ],
-    const uint8_t priv_key[SIKEp503_PRV_BYTESZ]);
+    uint8_t out_shared_key[SIKE_SS_BYTESZ],
+    const uint8_t ciphertext[SIKE_CT_BYTESZ],
+    const uint8_t pub_key[SIKE_PUB_BYTESZ],
+    const uint8_t priv_key[SIKE_PRV_BYTESZ]);
 
-#ifdef __cplusplus
+#if defined(__cplusplus)
 }
 #endif
 
diff --git a/third_party/sike/sike_test.cc b/third_party/sike/sike_test.cc
index 1277e09..2180a52 100644
--- a/third_party/sike/sike_test.cc
+++ b/third_party/sike/sike_test.cc
@@ -12,202 +12,189 @@
  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
 
+#include <gtest/gtest.h>
 #include <stdint.h>
 
-#include <gtest/gtest.h>
-
-#include "../../crypto/test/abi_test.h"
 #include "sike.h"
 #include "fpx.h"
+#include "../../crypto/test/abi_test.h"
 
 TEST(SIKE, RoundTrip) {
-    uint8_t sk[SIKEp503_PRV_BYTESZ] = {0};
-    uint8_t pk[SIKEp503_PUB_BYTESZ] = {0};
-    uint8_t ct[SIKEp503_CT_BYTESZ] = {0};
-    uint8_t ss_enc[SIKEp503_SS_BYTESZ] = {0};
-    uint8_t ss_dec[SIKEp503_SS_BYTESZ] = {0};
+  uint8_t sk[SIKE_PRV_BYTESZ] = {0};
+  uint8_t pk[SIKE_PUB_BYTESZ] = {0};
+  uint8_t ct[SIKE_CT_BYTESZ] = {0};
+  uint8_t ss_enc[SIKE_SS_BYTESZ] = {0};
+  uint8_t ss_dec[SIKE_SS_BYTESZ] = {0};
 
+  for (size_t i = 0; i < 30; i++) {
     EXPECT_EQ(SIKE_keypair(sk, pk), 1);
     SIKE_encaps(ss_enc, ct, pk);
     SIKE_decaps(ss_dec, ct, pk, sk);
 
-    EXPECT_EQ(memcmp(ss_enc, ss_dec, SIKEp503_SS_BYTESZ), 0);
+    EXPECT_EQ(memcmp(ss_enc, ss_dec, SIKE_SS_BYTESZ), 0);
+  }
 }
 
 TEST(SIKE, Decapsulation) {
-    const uint8_t sk[SIKEp503_PRV_BYTESZ] = {
-        0xDB, 0xAF, 0x2C, 0x89, 0xCA, 0x5A, 0xD4, 0x9D, 0x4F, 0x13,
-        0x40, 0xDF, 0x2D, 0xB1, 0x5F, 0x4C, 0x91, 0xA7, 0x1F, 0x0B,
-        0x29, 0x15, 0x01, 0x59, 0xBC, 0x5F, 0x0B, 0x4A, 0x03, 0x27,
-        0x6F, 0x18};
+  const uint8_t sk[SIKE_PRV_BYTESZ] = {
+      0xB1, 0xFD, 0x34, 0x42, 0xDB, 0x02, 0xBC, 0x9D, 0x4C, 0xD0,
+      0x72, 0x34, 0x4D, 0xBD, 0x06, 0xDF, 0x1C, 0x7D, 0x0A, 0x88,
+      0xB2, 0x50, 0xC4, 0xF6, 0xAE, 0xE8, 0x25, 0x01};
 
-    const uint8_t pk[SIKEp503_PUB_BYTESZ] = {
-        0x07, 0xAA, 0x51, 0x45, 0x3E, 0x1F, 0x53, 0x2A, 0x0A, 0x05,
-        0x46, 0xF6, 0x54, 0x7F, 0x5D, 0x56, 0xD6, 0x76, 0xD3, 0xEA,
-        0x4B, 0x6B, 0x01, 0x9B, 0x11, 0x72, 0x6F, 0x75, 0xEA, 0x34,
-        0x3C, 0x28, 0x2C, 0x36, 0xFD, 0x77, 0xDA, 0xBE, 0xB6, 0x20,
-        0x18, 0xC1, 0x93, 0x98, 0x18, 0x86, 0x30, 0x2F, 0x2E, 0xD2,
-        0x00, 0x61, 0xFF, 0xAE, 0x78, 0xAE, 0xFB, 0x6F, 0x32, 0xAC,
-        0x06, 0xBF, 0x35, 0xF6, 0xF7, 0x5B, 0x98, 0x26, 0x95, 0xC2,
-        0xD8, 0xD6, 0x1C, 0x0E, 0x47, 0xDA, 0x76, 0xCE, 0xB5, 0xF1,
-        0x19, 0xCC, 0x01, 0xE1, 0x17, 0xA9, 0x62, 0xF7, 0x82, 0x6C,
-        0x25, 0x51, 0x25, 0xAE, 0xFE, 0xE3, 0xE2, 0xE1, 0x35, 0xAE,
-        0x2E, 0x8F, 0x38, 0xE0, 0x7C, 0x74, 0x3C, 0x1D, 0x39, 0x91,
-        0x1B, 0xC7, 0x9F, 0x8E, 0x33, 0x4E, 0x84, 0x19, 0xB8, 0xD9,
-        0xC2, 0x71, 0x35, 0x02, 0x47, 0x3E, 0x79, 0xEF, 0x47, 0xE1,
-        0xD8, 0x21, 0x96, 0x1F, 0x11, 0x59, 0x39, 0x34, 0x76, 0xEF,
-        0x3E, 0xB7, 0x4E, 0xFB, 0x7C, 0x55, 0xA1, 0x85, 0xAA, 0xAB,
-        0xAD, 0xF0, 0x09, 0xCB, 0xD1, 0xE3, 0x7C, 0x4F, 0x5D, 0x2D,
-        0xE1, 0x13, 0xF0, 0x71, 0xD9, 0xE5, 0xF6, 0xAF, 0x7F, 0xC1,
-        0x27, 0x95, 0x8D, 0x52, 0xD5, 0x96, 0x42, 0x38, 0x41, 0xF7,
-        0x24, 0x3F, 0x3A, 0xB5, 0x7E, 0x11, 0xE4, 0xF9, 0x33, 0xEE,
-        0x4D, 0xBE, 0x74, 0x48, 0xF9, 0x98, 0x04, 0x01, 0x16, 0xEB,
-        0xA9, 0x0D, 0x61, 0xC6, 0xFD, 0x4C, 0xCF, 0x98, 0x84, 0x4A,
-        0x94, 0xAC, 0x69, 0x2C, 0x02, 0x8B, 0xE3, 0xD1, 0x41, 0x0D,
-        0xF2, 0x2D, 0x46, 0x1F, 0x57, 0x1C, 0x77, 0x86, 0x18, 0xE3,
-        0x63, 0xDE, 0xF3, 0xE3, 0x02, 0x30, 0x54, 0x73, 0xAE, 0xC2,
-        0x32, 0xA2, 0xCE, 0xEB, 0xCF, 0x81, 0x46, 0x54, 0x5C, 0xF4,
-        0x5D, 0x2A, 0x03, 0x5D, 0x9C, 0xAE, 0xE0, 0x60, 0x03, 0x80,
-        0x11, 0x30, 0xA5, 0xAA, 0xD1, 0x75, 0x67, 0xE0, 0x1C, 0x2B,
-        0x6B, 0x5D, 0x83, 0xDE, 0x92, 0x9B, 0x0E, 0xD7, 0x11, 0x0F,
-        0x00, 0xC4, 0x59, 0xE4, 0x81, 0x04, 0x3B, 0xEE, 0x5C, 0x04,
-        0xD1, 0x0E, 0xD0, 0x67, 0xF5, 0xCC, 0xAA, 0x72, 0x73, 0xEA,
-        0xC4, 0x76, 0x99, 0x3B, 0x4C, 0x90, 0x2F, 0xCB, 0xD8, 0x0A,
-        0x5B, 0xEC, 0x0E, 0x0E, 0x1F, 0x59, 0xEA, 0x14, 0x8D, 0x34,
-        0x53, 0x65, 0x4C, 0x1A, 0x59, 0xA8, 0x95, 0x66, 0x60, 0xBB,
-        0xC4, 0xCC, 0x32, 0xA9, 0x8D, 0x2A, 0xAA, 0x14, 0x6F, 0x0F,
-        0x81, 0x4D, 0x32, 0x02, 0xFD, 0x33, 0x58, 0x42, 0xCF, 0xF3,
-        0x67, 0xD0, 0x9F, 0x0B, 0xB1, 0xCC, 0x18, 0xA5, 0xC4, 0x19,
-        0xB6, 0x00, 0xED, 0xFA, 0x32, 0x1A, 0x5F, 0x67, 0xC8, 0xC3,
-        0xEB, 0x0D, 0xB5, 0x9A, 0x36, 0x47, 0x82, 0x00};
+  const uint8_t pk[SIKE_PUB_BYTESZ] = {
+      0x6D, 0x8D, 0xF5, 0x7B, 0xCD, 0x47, 0xCA, 0xCB, 0x7A, 0x38, 0xB7, 0xA6,
+      0x90, 0xB7, 0x37, 0x03, 0xD4, 0x6F, 0x27, 0x73, 0x74, 0x17, 0x5A, 0xA4,
+      0x0D, 0xC6, 0x81, 0xAD, 0xDB, 0xF7, 0x18, 0xB2, 0x3C, 0x30, 0xCF, 0xAA,
+      0x08, 0x11, 0x91, 0xCC, 0x27, 0x4E, 0xF1, 0xA6, 0xB7, 0xDA, 0xD2, 0xCF,
+      0x99, 0x7F, 0xF7, 0xE1, 0xD0, 0xCE, 0x00, 0xD2, 0x4B, 0xA4, 0x33, 0xB4,
+      0x87, 0x01, 0x3F, 0x02, 0xF7, 0xF9, 0xDE, 0xC3, 0x60, 0x62, 0xDA, 0x3F,
+      0x74, 0xA9, 0x44, 0xBE, 0x19, 0xD5, 0x03, 0x2A, 0x79, 0x8C, 0xA7, 0xFF,
+      0xEA, 0xB3, 0xBB, 0xB5, 0xD4, 0x1D, 0x8F, 0x92, 0xCE, 0x62, 0x6E, 0x99,
+      0x24, 0xD7, 0x57, 0xFA, 0xCD, 0xB6, 0xE2, 0x8E, 0xFD, 0x22, 0x0E, 0x31,
+      0x21, 0x01, 0x8D, 0x79, 0xF8, 0x3E, 0x27, 0xEC, 0x43, 0x40, 0xDB, 0x82,
+      0xE5, 0xEB, 0x6C, 0x97, 0x66, 0x29, 0x15, 0x68, 0xB7, 0x4D, 0x84, 0xD1,
+      0x8A, 0x0B, 0x12, 0x36, 0x2C, 0x0C, 0x0A, 0x6E, 0x4E, 0xDE, 0xA5, 0x8A,
+      0xDE, 0x77, 0xDD, 0x70, 0x49, 0x73, 0xAC, 0x27, 0x6D, 0x8D, 0x25, 0x9A,
+      0xE4, 0x25, 0xE8, 0x95, 0x8F, 0xFE, 0x90, 0x3B, 0x00, 0x69, 0x20, 0xE8,
+      0x7C, 0xA5, 0xF5, 0x79, 0xC0, 0x61, 0x51, 0x91, 0x35, 0x25, 0x3F, 0x17,
+      0x2F, 0x70, 0x73, 0xF0, 0x89, 0xB5, 0xC8, 0x25, 0xB8, 0xE5, 0x7E, 0x34,
+      0xDD, 0x11, 0xE5, 0xD6, 0xC3, 0xD5, 0x29, 0x89, 0xC6, 0x2C, 0x99, 0x53,
+      0x1D, 0x2C, 0x77, 0xB0, 0xB6, 0xA1, 0xBD, 0x79, 0xFB, 0x4A, 0xC2, 0x48,
+      0x4C, 0x62, 0x51, 0x00, 0xE3, 0x91, 0x2A, 0xCB, 0x84, 0x03, 0x5D, 0x2D,
+      0xC8, 0x33, 0xE9, 0x14, 0xBF, 0x74, 0x21, 0xBC, 0xF4, 0x76, 0xE5, 0x42,
+      0xB8, 0xBD, 0xE2, 0xE7, 0x20, 0x95, 0x54, 0xF2, 0xED, 0xC0, 0x79, 0x38,
+      0x1E, 0xD2, 0xEA, 0x1A, 0x63, 0x85, 0xE7, 0x3A, 0xDA, 0xAD, 0xAB, 0x1B,
+      0x1E, 0x19, 0x9E, 0x73, 0xD0, 0x10, 0x2E, 0x38, 0xAC, 0x8B, 0x00, 0x6A,
+      0x30, 0x2C, 0x3D, 0x70, 0x8E, 0x39, 0x6D, 0xC0, 0x12, 0x61, 0x7D, 0x2A,
+      0x0A, 0x04, 0x95, 0x8E, 0x09, 0x3C, 0x7B, 0xEC, 0x2E, 0xBC, 0xE8, 0xE8,
+      0xE8, 0x37, 0x29, 0xC4, 0x7E, 0x76, 0x48, 0xB9, 0x3B, 0x72, 0xE5, 0x99,
+      0x9B, 0xF9, 0xE3, 0x99, 0x72, 0x3F, 0x35, 0x29, 0x85, 0xE0, 0xC8, 0xBF,
+      0xB1, 0x6B, 0xB1, 0x6E, 0x72, 0x00};
 
-    const uint8_t ct_exp[SIKEp503_CT_BYTESZ] = {
-        0xE6, 0xB7, 0xE5, 0x7B, 0xA9, 0x19, 0xD1, 0x2C, 0xB8, 0x5C,
-        0x7B, 0x66, 0x74, 0xB0, 0x71, 0xA1, 0xFF, 0x71, 0x7F, 0x4B,
-        0xB5, 0xA6, 0xAF, 0x48, 0x32, 0x52, 0xD5, 0x82, 0xEE, 0x8A,
-        0xBB, 0x08, 0x1E, 0xF6, 0xAC, 0x91, 0xA2, 0xCB, 0x6B, 0x6A,
-        0x09, 0x2B, 0xD9, 0xC6, 0x27, 0xD6, 0x3A, 0x6B, 0x8D, 0xFC,
-        0xB8, 0x90, 0x8F, 0x72, 0xB3, 0xFA, 0x7D, 0x34, 0x7A, 0xC4,
-        0x7E, 0xE3, 0x30, 0xC5, 0xA0, 0xFE, 0x3D, 0x43, 0x14, 0x4E,
-        0x3A, 0x14, 0x76, 0x3E, 0xFB, 0xDF, 0xE3, 0xA8, 0xE3, 0x5E,
-        0x38, 0xF2, 0xE0, 0x39, 0x67, 0x60, 0xFD, 0xFB, 0xB4, 0x19,
-        0xCD, 0xE1, 0x93, 0xA2, 0x06, 0xCC, 0x65, 0xCD, 0x6E, 0xC8,
-        0xB4, 0x5E, 0x41, 0x4B, 0x6C, 0xA5, 0xF4, 0xE4, 0x9D, 0x52,
-        0x8C, 0x25, 0x60, 0xDD, 0x3D, 0xA9, 0x7F, 0xF2, 0x88, 0xC1,
-        0x0C, 0xEE, 0x97, 0xE0, 0xE7, 0x3B, 0xB7, 0xD3, 0x6F, 0x28,
-        0x79, 0x2F, 0x50, 0xB2, 0x4F, 0x74, 0x3A, 0x0C, 0x88, 0x27,
-        0x98, 0x3A, 0x27, 0xD3, 0x26, 0x83, 0x59, 0x49, 0x81, 0x5B,
-        0x0D, 0xA7, 0x0C, 0x4F, 0xEF, 0xFB, 0x1E, 0xAF, 0xE9, 0xD2,
-        0x1C, 0x10, 0x25, 0xEC, 0x9E, 0xFA, 0x57, 0x36, 0xAA, 0x3F,
-        0xC1, 0xA3, 0x2C, 0xE9, 0xB5, 0xC9, 0xED, 0x72, 0x51, 0x4C,
-        0x02, 0xB4, 0x7B, 0xB3, 0xED, 0x9F, 0x45, 0x03, 0x34, 0xAC,
-        0x9A, 0x9E, 0x62, 0x5F, 0x82, 0x7A, 0x77, 0x34, 0xF9, 0x21,
-        0x94, 0xD2, 0x38, 0x3D, 0x05, 0xF0, 0x8A, 0x60, 0x1C, 0xB7,
-        0x1D, 0xF5, 0xB7, 0x53, 0x77, 0xD3, 0x9D, 0x3D, 0x70, 0x6A,
-        0xCB, 0x18, 0x20, 0x6B, 0x29, 0x17, 0x3A, 0x6D, 0xA1, 0xB2,
-        0x64, 0xDB, 0x6C, 0xE6, 0x1A, 0x95, 0xA7, 0xF4, 0x1A, 0x78,
-        0x1D, 0xA2, 0x40, 0x15, 0x41, 0x59, 0xDD, 0xEE, 0x23, 0x57,
-        0xCE, 0x36, 0x0D, 0x55, 0xBD, 0xB8, 0xFD, 0x0F, 0x35, 0xBD,
-        0x5B, 0x92, 0xD6, 0x1C, 0x84, 0x8C, 0x32, 0x64, 0xA6, 0x5C,
-        0x45, 0x18, 0x07, 0x6B, 0xF9, 0xA9, 0x43, 0x9A, 0x83, 0xCD,
-        0xB5, 0xB3, 0xD9, 0x17, 0x99, 0x2C, 0x2A, 0x8B, 0xE0, 0x8E,
-        0xAF, 0xA6, 0x4C, 0x95, 0xBB, 0x70, 0x60, 0x1A, 0x3A, 0x97,
-        0xAA, 0x2F, 0x3D, 0x22, 0x83, 0xB7, 0x4F, 0x59, 0xED, 0x3F,
-        0x4E, 0xF4, 0x19, 0xC6, 0x25, 0x0B, 0x0A, 0x5E, 0x21, 0xB9,
-        0x91, 0xB8, 0x19, 0x84, 0x48, 0x78, 0xCE, 0x27, 0xBF, 0x41,
-        0x89, 0xF6, 0x30, 0xFD, 0x6B, 0xD9, 0xB8, 0x1D, 0x72, 0x8A,
-        0x56, 0xCC, 0x2F, 0x82, 0xE4, 0x46, 0x4D, 0x75, 0xD8, 0x92,
-        0xE6, 0x9C, 0xCC, 0xD2, 0xCD, 0x35, 0xE4, 0xFC, 0x2A, 0x85,
-        0x6B, 0xA9, 0xB2, 0x27, 0xC9, 0xA1, 0xFF, 0xB3, 0x96, 0x3E,
-        0x59, 0xF6, 0x4C, 0x66, 0x56, 0x2E, 0xF5, 0x1B, 0x97, 0x32,
-        0xB0, 0x71, 0x5A, 0x9C, 0x50, 0x4B, 0x6F, 0xC4, 0xCA, 0x94,
-        0x75, 0x37, 0x46, 0x10, 0x12, 0x2F, 0x4F, 0xA3, 0x82, 0xCD,
-        0xBD, 0x7C};
+  const uint8_t ct[SIKE_CT_BYTESZ] = {
+      0xFF, 0xEB, 0xEF, 0x4A, 0xC0, 0x57, 0x0F, 0x26, 0xAC, 0x76, 0xA8, 0xB0,
+      0xA3, 0x5D, 0x9C, 0xD9, 0x25, 0xD1, 0x7F, 0x92, 0x5D, 0xF4, 0x23, 0x34,
+      0xC3, 0x03, 0x10, 0xE1, 0xB0, 0x24, 0x9B, 0x44, 0x58, 0x26, 0x13, 0x56,
+      0x83, 0x43, 0x72, 0x69, 0x28, 0x0D, 0x55, 0x07, 0x1F, 0xDB, 0xC0, 0x23,
+      0x34, 0x83, 0x1A, 0x09, 0x9B, 0x80, 0x00, 0x64, 0x56, 0xDC, 0x79, 0x7A,
+      0xD2, 0xCE, 0x23, 0xC9, 0x72, 0x27, 0xFC, 0x8D, 0xAB, 0xBF, 0xD3, 0x17,
+      0xF6, 0x91, 0x7B, 0x15, 0x93, 0x83, 0x8A, 0x4F, 0x6C, 0xCA, 0x4A, 0x94,
+      0xDA, 0xC7, 0x9D, 0xB6, 0xD6, 0xBA, 0xBD, 0x81, 0x9A, 0x78, 0xE5, 0xE5,
+      0xBE, 0x17, 0xBC, 0xCB, 0xC8, 0x23, 0x80, 0x5F, 0x75, 0xF8, 0xDB, 0x51,
+      0x55, 0x00, 0x25, 0x33, 0x52, 0x64, 0xB2, 0xD6, 0xD8, 0x9A, 0x2A, 0x9E,
+      0x29, 0x99, 0x13, 0x33, 0xE2, 0xA7, 0x98, 0xAC, 0xD7, 0x79, 0x5C, 0x2F,
+      0xBA, 0x07, 0xC3, 0x03, 0x37, 0xD6, 0xE6, 0xB5, 0xA1, 0xF5, 0x29, 0xB6,
+      0xF6, 0xC0, 0x5C, 0x44, 0x68, 0x2B, 0x0B, 0xF5, 0x00, 0x01, 0x44, 0xD5,
+      0xCC, 0x23, 0xB5, 0x27, 0x4F, 0xCA, 0xB4, 0x05, 0x01, 0xF9, 0xD4, 0x41,
+      0xE0, 0xE1, 0x1E, 0xCF, 0xA9, 0xBC, 0x79, 0xD7, 0xD5, 0xF5, 0x3C, 0xE6,
+      0x93, 0xF4, 0x6C, 0x84, 0x5A, 0x2C, 0x4B, 0xE4, 0x91, 0xB2, 0xB2, 0xB8,
+      0xAD, 0x74, 0x9A, 0x69, 0x79, 0x4C, 0x84, 0xB7, 0xBF, 0xF1, 0x68, 0x4B,
+      0xAE, 0x0F, 0x7F, 0x45, 0x3B, 0x18, 0x3F, 0xFA, 0x00, 0x48, 0xE0, 0x3A,
+      0xE2, 0xC0, 0xAE, 0x00, 0xCE, 0x90, 0x28, 0xA4, 0x1B, 0xBE, 0xCA, 0x0C,
+      0x21, 0x29, 0x64, 0x30, 0x5E, 0x35, 0xAD, 0xFD, 0x83, 0x47, 0x40, 0x6D,
+      0x15, 0x56, 0xFC, 0xF8, 0x5F, 0xAB, 0x81, 0xFE, 0x6B, 0xE9, 0x6B, 0xED,
+      0x27, 0x35, 0x7C, 0xD8, 0x2C, 0xD4, 0xF2, 0x11, 0xE6, 0xAF, 0xDF, 0xB8,
+      0x91, 0x96, 0xEB, 0xF7, 0x4C, 0x8D, 0x70, 0x77, 0x90, 0x81, 0x00, 0x09,
+      0x19, 0x27, 0x8A, 0x9E, 0xB6, 0x1A, 0xE9, 0xAC, 0x6C, 0xC9, 0xF8, 0xEA,
+      0xA2, 0x34, 0xB8, 0xAC, 0xB3, 0xB3, 0x68, 0xA1, 0xB7, 0x29, 0x55, 0xCA,
+      0x40, 0x23, 0x92, 0x5C, 0x0C, 0x79, 0x6B, 0xD6, 0x9F, 0x5B, 0xD2, 0xE6,
+      0xAE, 0x04, 0xCB, 0xEC, 0xC7, 0x88, 0x18, 0xDB, 0x7A, 0xE6, 0xD6, 0xC9,
+      0x39, 0xFD, 0x93, 0x9B, 0xC8, 0x01, 0x6F, 0x3E, 0x6C, 0x90, 0x3E, 0x73,
+      0x76, 0x99, 0x7C, 0x48, 0xDA, 0x68, 0x48, 0x80, 0x2B, 0x63};
 
-    const uint8_t ss_exp[SIKEp503_SS_BYTESZ] = {
-        0x74, 0x3D, 0x25, 0x36, 0x00, 0x24, 0x63, 0x1A, 0x39, 0x1A,
-        0xB4, 0xAD, 0x01, 0x17, 0x78, 0xE9};
+  const uint8_t ss_exp[SIKE_SS_BYTESZ] = {0xA1, 0xF9, 0x5A, 0x67, 0xB9, 0x3D,
+                                          0x1E, 0x72, 0xE8, 0xC5, 0x71, 0xF1,
+                                          0x4C, 0xB2, 0xAA, 0x6D};
 
-    uint8_t ss_dec[SIKEp503_SS_BYTESZ] = {0};
-    SIKE_decaps(ss_dec, ct_exp, pk, sk);
-    EXPECT_EQ(memcmp(ss_dec, ss_exp, sizeof(ss_exp)), 0);
+  uint8_t ss_dec[SIKE_SS_BYTESZ] = {0};
+  SIKE_decaps(ss_dec, ct, pk, sk);
+  EXPECT_EQ(memcmp(ss_dec, ss_exp, sizeof(ss_exp)), 0);
 }
 
 // SIKE_encaps and SIKE_keypair doesn't return zeros.
 TEST(SIKE, NonZero) {
-    uint8_t sk[SIKEp503_PRV_BYTESZ] = {0};
-    uint8_t pk[SIKEp503_PUB_BYTESZ] = {0};
-    uint8_t ct[SIKEp503_CT_BYTESZ] = {0};
-    uint8_t ss[SIKEp503_SS_BYTESZ] = {0};
+  uint8_t sk[SIKE_PRV_BYTESZ] = {0};
+  uint8_t pk[SIKE_PUB_BYTESZ] = {0};
+  uint8_t ct[SIKE_CT_BYTESZ] = {0};
+  uint8_t ss[SIKE_SS_BYTESZ] = {0};
 
-    // Check secret and public key returned by SIKE_keypair
-    EXPECT_EQ(SIKE_keypair(sk, pk), 1);
-    uint8_t tmp = 0;
-    for (size_t i=0; i<sizeof(sk); i++) tmp|=sk[i];
-    EXPECT_NE(tmp, 0);
+  // Check secret and public key returned by SIKE_keypair
+  EXPECT_EQ(SIKE_keypair(sk, pk), 1);
+  uint8_t tmp = 0;
+  for (size_t i = 0; i < sizeof(sk); i++) {
+    tmp |= sk[i];
+  }
+  EXPECT_NE(tmp, 0);
 
-    tmp = 0;
-    for (size_t i=0; i<sizeof(pk); i++) tmp|=pk[i];
-    EXPECT_NE(tmp, 0);
+  tmp = 0;
+  for (size_t i = 0; i < sizeof(pk); i++) {
+    tmp |= pk[i];
+  }
+  EXPECT_NE(tmp, 0);
 
-    // Check shared secret and ciphertext returned by SIKE_encaps
-    SIKE_encaps(ss, ct, pk);
-    tmp = 0;
-    for (size_t i=0; i<sizeof(ct); i++) tmp|=ct[i];
-    EXPECT_NE(tmp, 0);
+  // Check shared secret and ciphertext returned by SIKE_encaps
+  SIKE_encaps(ss, ct, pk);
+  tmp = 0;
+  for (size_t i = 0; i < sizeof(ct); i++) {
+    tmp |= ct[i];
+  }
+  EXPECT_NE(tmp, 0);
 
-    tmp = 0;
-    for (size_t i=0; i<sizeof(ss); i++) tmp|=ss[i];
-    EXPECT_NE(tmp, 0);
+  tmp = 0;
+  for (size_t i = 0; i < sizeof(ss); i++) {
+    tmp |= ss[i];
+  }
+  EXPECT_NE(tmp, 0);
 }
 
 TEST(SIKE, Negative) {
-    uint8_t sk[SIKEp503_PRV_BYTESZ] = {0};
-    uint8_t pk[SIKEp503_PUB_BYTESZ] = {0};
-    uint8_t ct[SIKEp503_CT_BYTESZ] = {0};
-    uint8_t ss_enc[SIKEp503_SS_BYTESZ] = {0};
-    uint8_t ss_dec[SIKEp503_SS_BYTESZ] = {0};
+  uint8_t sk[SIKE_PRV_BYTESZ] = {0};
+  uint8_t pk[SIKE_PUB_BYTESZ] = {0};
+  uint8_t ct[SIKE_CT_BYTESZ] = {0};
+  uint8_t ss_enc[SIKE_SS_BYTESZ] = {0};
+  uint8_t ss_dec[SIKE_SS_BYTESZ] = {0};
 
-    EXPECT_EQ(SIKE_keypair(sk, pk), 1);
-    SIKE_encaps(ss_enc, ct, pk);
+  EXPECT_EQ(SIKE_keypair(sk, pk), 1);
+  SIKE_encaps(ss_enc, ct, pk);
 
-    // Change cipertext
-    uint8_t ct_tmp[SIKEp503_CT_BYTESZ] = {0};
-    memcpy(ct_tmp, ct, sizeof(ct));
-    ct_tmp[0] = ~ct_tmp[0];
-    SIKE_decaps(ss_dec, ct_tmp, pk, sk);
-    EXPECT_NE(memcmp(ss_enc, ss_dec, SIKEp503_SS_BYTESZ), 0);
+  // Change ciphertext
+  uint8_t ct_tmp[SIKE_CT_BYTESZ] = {0};
+  memcpy(ct_tmp, ct, sizeof(ct));
+  ct_tmp[0] = ~ct_tmp[0];
+  SIKE_decaps(ss_dec, ct_tmp, pk, sk);
+  EXPECT_NE(memcmp(ss_enc, ss_dec, SIKE_SS_BYTESZ), 0);
 
-    // Change secret key
-    uint8_t sk_tmp[SIKEp503_PRV_BYTESZ] = {0};
-    memcpy(sk_tmp, sk, sizeof(sk));
-    sk_tmp[0] = ~sk_tmp[0];
-    SIKE_decaps(ss_dec, ct, pk, sk_tmp);
-    EXPECT_NE(memcmp(ss_enc, ss_dec, SIKEp503_SS_BYTESZ), 0);
+  // Change secret key
+  uint8_t sk_tmp[SIKE_PRV_BYTESZ] = {0};
+  memcpy(sk_tmp, sk, sizeof(sk));
+  sk_tmp[0] = ~sk_tmp[0];
+  SIKE_decaps(ss_dec, ct, pk, sk_tmp);
+  EXPECT_NE(memcmp(ss_enc, ss_dec, SIKE_SS_BYTESZ), 0);
 
-    // Change public key
-    uint8_t pk_tmp[SIKEp503_PUB_BYTESZ] = {0};
-    memcpy(pk_tmp, pk, sizeof(pk));
-    pk_tmp[0] = ~pk_tmp[0];
-    SIKE_decaps(ss_dec, ct, pk_tmp, sk);
-    EXPECT_NE(memcmp(ss_enc, ss_dec, SIKEp503_SS_BYTESZ), 0);
+  // Change public key
+  uint8_t pk_tmp[SIKE_PUB_BYTESZ] = {0};
+  memcpy(pk_tmp, pk, sizeof(pk));
+  pk_tmp[0] = ~pk_tmp[0];
+  SIKE_decaps(ss_dec, ct, pk_tmp, sk);
+  EXPECT_NE(memcmp(ss_enc, ss_dec, SIKE_SS_BYTESZ), 0);
 }
 
 TEST(SIKE, Unaligned) {
-  alignas(4) uint8_t priv[SIKEp503_PRV_BYTESZ + 1];
-  alignas(4) uint8_t pub[SIKEp503_PUB_BYTESZ + 1];
-  alignas(4) uint8_t shared_key1[SIKEp503_SS_BYTESZ + 1];
-  alignas(4) uint8_t ciphertext[SIKEp503_CT_BYTESZ + 1];
-  alignas(4) uint8_t shared_key2[SIKEp503_SS_BYTESZ + 1];
+  alignas(4) uint8_t priv[SIKE_PRV_BYTESZ + 1];
+  alignas(4) uint8_t pub[SIKE_PUB_BYTESZ + 1];
+  alignas(4) uint8_t shared_key1[SIKE_SS_BYTESZ + 1];
+  alignas(4) uint8_t ciphertext[SIKE_CT_BYTESZ + 1];
+  alignas(4) uint8_t shared_key2[SIKE_SS_BYTESZ + 1];
 
   ASSERT_TRUE(SIKE_keypair(priv + 1, pub + 1));
   SIKE_encaps(shared_key1 + 1, ciphertext + 1, pub + 1);
   SIKE_decaps(shared_key2 + 1, ciphertext + 1, pub + 1, priv + 1);
 
-  EXPECT_EQ(memcmp(shared_key1 + 1, shared_key2 + 1, SIKEp503_SS_BYTESZ), 0);
+  EXPECT_EQ(memcmp(shared_key1 + 1, shared_key2 + 1, SIKE_SS_BYTESZ), 0);
 }
 
-#if defined(SUPPORTS_ABI_TEST) && (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64))
+#if defined(SUPPORTS_ABI_TEST) && \
+    (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64))
 TEST(SIKE, ABI) {
   felm_t a, b, c;
   dfelm_t d, e, f;
@@ -219,4 +206,46 @@
   CHECK_ABI(sike_mpsubx2_asm, d, e, f);
   CHECK_ABI(sike_mpdblsubx2_asm, d, e, f);
 }
+
+// Additional tests checking that the assembly implementations
+// of MUL and REDC handle carry chains correctly.
+TEST(SIKE, CarryChains) {
+  // Expected results
+  const dfelm_t exp_mul = {
+    0x0000000000000001, 0x0000000000000000, 0x0000000000000000,
+    0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+    0x0000000000000000, 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF,
+    0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+    0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+  };
+
+  const felm_t exp_redc = {
+    0x93AA0C8C2D3235BE, 0xA8CD35DDDE399B46, 0xB9BBA5469509CA65,
+    0x6B2FB3A5A2FB86E4, 0x585591BA6DBE862C, 0xD92D3FF5FE0938F2,
+    0x0001E1F0EE75A1E1
+  };
+
+  // Input
+  dfelm_t in14 = {
+    0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+    0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+    0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+    0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+    0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+  };
+
+  felm_t in7 = {
+    0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+    0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
+    0xFFFFFFFFFFFFFFFF
+  };
+
+  dfelm_t res;
+  sike_mpmul(in7, in7, res);
+  EXPECT_EQ(memcmp(exp_mul, res, sizeof(exp_mul)), 0);
+
+  // modifies in14 and in7
+  sike_fprdc(in14, in7);
+  EXPECT_EQ(memcmp(exp_redc, in7, sizeof(exp_redc)), 0);
+}
 #endif  // SUPPORTS_ABI_TEST && (X86_64 || AARCH64)
diff --git a/third_party/sike/utils.h b/third_party/sike/utils.h
index 74c640a..cbc8329 100644
--- a/third_party/sike/utils.h
+++ b/third_party/sike/utils.h
@@ -1,7 +1,7 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
 *
-* Abstract: internal header file for P503
+* Abstract: internal header file for P434
 *********************************************************************************************/
 
 #ifndef UTILS_H_
@@ -16,33 +16,33 @@
 #define BITS_TO_BYTES(nbits)      (((nbits)+7)/8)
 
 // Bit size of the field
-#define BITS_FIELD             503
+#define BITS_FIELD              434
 // Byte size of the field
 #define FIELD_BYTESZ            BITS_TO_BYTES(BITS_FIELD)
-// Number of 64-bit words of a 256-bit element
-#define NBITS_ORDER             256
+// Number of 64-bit words of a 224-bit element
+#define NBITS_ORDER             224
 #define NWORDS64_ORDER          ((NBITS_ORDER+63)/64)
 // Number of elements in Alice's strategy
-#define A_max                   125
+#define A_max                   108
 // Number of elements in Bob's strategy
-#define B_max                   159
+#define B_max                   137
 // Word size size
 #define RADIX                   sizeof(crypto_word_t)*8
 // Byte size of a limb
 #define LSZ                     sizeof(crypto_word_t)
 
 #if defined(OPENSSL_64_BIT)
-    // Number of words of a 503-bit field element
-    #define NWORDS_FIELD    8
-    // Number of "0" digits in the least significant part of p503 + 1
-    #define p503_ZERO_WORDS 3
+    // Number of words of a 434-bit field element
+    #define NWORDS_FIELD    7
+    // Number of "0" digits in the least significant part of p434 + 1
+    #define ZERO_WORDS 3
     // U64_TO_WORDS expands |x| for a |crypto_word_t| array literal.
     #define U64_TO_WORDS(x) UINT64_C(x)
 #else
-    // Number of words of a 503-bit field element
-    #define NWORDS_FIELD    16
-    // Number of "0" digits in the least significant part of p503 + 1
-    #define p503_ZERO_WORDS 7
+    // Number of words of a 434-bit field element
+    #define NWORDS_FIELD    14
+    // Number of "0" digits in the least significant part of p434 + 1
+    #define ZERO_WORDS 6
     // U64_TO_WORDS expands |x| for a |crypto_word_t| array literal.
     #define U64_TO_WORDS(x) \
         (uint32_t)(UINT64_C(x) & 0xffffffff), (uint32_t)(UINT64_C(x) >> 32)
@@ -88,15 +88,15 @@
 #define F2ELM_INIT {{ {0}, {0} }}
 #define POINT_PROJ_INIT {{ F2ELM_INIT, F2ELM_INIT }}
 
-// Datatype for representing 503-bit field elements (512-bit max.)
-// Elements over GF(p503) are encoded in 63 octets in little endian format
+// Datatype for representing 434-bit field elements (448-bit max.)
+// Elements over GF(p434) are encoded in 55 octets in little endian format
 // (i.e., the least significant octet is located in the lowest memory address).
 typedef crypto_word_t felm_t[NWORDS_FIELD];
 
 // An element in F_{p^2}, is composed of two coefficients from F_p, * i.e.
 // Fp2 element = c0 + c1*i in F_{p^2}
-// Datatype for representing double-precision 2x503-bit field elements (512-bit max.)
-// Elements (a+b*i) over GF(p503^2), where a and b are defined over GF(p503), are
+// Datatype for representing double-precision 2x434-bit field elements (448-bit max.)
+// Elements (a+b*i) over GF(p434^2), where a and b are defined over GF(p434), are
 // encoded as {a, b}, with a in the lowest memory portion.
 typedef struct {
     felm_t c0;
@@ -106,28 +106,30 @@
 // Our F_{p^2} element type is a pointer to the struct.
 typedef fp2 f2elm_t[1];
 
-// Datatype for representing double-precision 2x503-bit
+// Datatype for representing double-precision 2x434-bit
 // field elements in contiguous memory.
 typedef crypto_word_t dfelm_t[2*NWORDS_FIELD];
 
-// Constants used during SIKEp503 computation.
+// Constants used during SIKE computation.
 struct params_t {
-    // Stores P503 prime
+    // Stores a prime
     const crypto_word_t prime[NWORDS_FIELD];
-    // Stores P503 + 1
+    // Stores prime + 1
     const crypto_word_t prime_p1[NWORDS_FIELD];
-    // Stores P503 * 2
+    // Stores prime * 2
     const crypto_word_t prime_x2[NWORDS_FIELD];
-    // Alice's generator values {XPA0 + XPA1*i, XQA0, XRA0 + XRA1*i}
-    // in GF(p503^2), expressed in Montgomery representation
-    const crypto_word_t A_gen[5*NWORDS_FIELD];
-    // Bob's generator values {XPB0 + XPB1*i, XQB0, XRB0 + XRB1*i}
-    // in GF(p503^2), expressed in Montgomery representation
-    const crypto_word_t B_gen[5*NWORDS_FIELD];
-    // Montgomery constant mont_R2 = (2^512)^2 mod p503
+    // Alice's generator values {XPA0 + XPA1*i, XQA0 + XQA1*i, XRA0 + XRA1*i}
+    // in GF(prime^2), expressed in Montgomery representation
+    const crypto_word_t A_gen[6*NWORDS_FIELD];
+    // Bob's generator values {XPB0 + XPB1*i, XQB0 + XQB1*i, XRB0 + XRB1*i}
+    // in GF(prime^2), expressed in Montgomery representation
+    const crypto_word_t B_gen[6*NWORDS_FIELD];
+    // Montgomery constant mont_R2 = (2^448)^2 mod prime
     const crypto_word_t mont_R2[NWORDS_FIELD];
     // Value 'one' in Montgomery representation
     const crypto_word_t mont_one[NWORDS_FIELD];
+    // Value '6' in Montgomery representation
+    const crypto_word_t mont_six[NWORDS_FIELD];
     // Fixed parameters for isogeny tree computation
     const unsigned int A_strat[A_max-1];
     const unsigned int B_strat[B_max-1];
diff --git a/tool/speed.cc b/tool/speed.cc
index 47edc75..3929cf6 100644
--- a/tool/speed.cc
+++ b/tool/speed.cc
@@ -296,14 +296,14 @@
   return true;
 }
 
-static bool SpeedSIKEP503(const std::string &selected) {
+static bool SpeedSIKEP434(const std::string &selected) {
   if (!selected.empty() && selected.find("SIKE") == std::string::npos) {
     return true;
   }
   // speed generation
-  uint8_t public_SIKE[SIKEp503_PUB_BYTESZ];
-  uint8_t private_SIKE[SIKEp503_PRV_BYTESZ];
-  uint8_t ct[SIKEp503_CT_BYTESZ];
+  uint8_t public_SIKE[SIKE_PUB_BYTESZ];
+  uint8_t private_SIKE[SIKE_PRV_BYTESZ];
+  uint8_t ct[SIKE_CT_BYTESZ];
   bool res;
 
   {
@@ -312,7 +312,7 @@
                 [&private_SIKE, &public_SIKE]() -> bool {
       return (SIKE_keypair(private_SIKE, public_SIKE) == 1);
     });
-    results.Print("SIKE/P503 generate");
+    results.Print("SIKE/P434 generate");
   }
 
   if (!res) {
@@ -324,11 +324,11 @@
     TimeResults results;
     TimeFunction(&results,
                 [&ct, &public_SIKE]() -> bool {
-      uint8_t ss[SIKEp503_SS_BYTESZ];
+      uint8_t ss[SIKE_SS_BYTESZ];
       SIKE_encaps(ss, ct, public_SIKE);
       return true;
     });
-    results.Print("SIKE/P503 encap");
+    results.Print("SIKE/P434 encap");
   }
 
   if (!res) {
@@ -340,11 +340,11 @@
     TimeResults results;
     TimeFunction(&results,
                 [&ct, &public_SIKE, &private_SIKE]() -> bool {
-      uint8_t ss[SIKEp503_SS_BYTESZ];
+      uint8_t ss[SIKE_SS_BYTESZ];
       SIKE_decaps(ss, ct, public_SIKE, private_SIKE);
       return true;
     });
-    results.Print("SIKE/P503 decap");
+    results.Print("SIKE/P434 decap");
   }
 
   if (!res) {
@@ -998,7 +998,7 @@
       !SpeedECDH(selected) ||
       !SpeedECDSA(selected) ||
       !Speed25519(selected) ||
-      !SpeedSIKEP503(selected) ||
+      !SpeedSIKEP434(selected) ||
       !SpeedSPAKE2(selected) ||
       !SpeedScrypt(selected) ||
       !SpeedRSAKeyGen(selected) ||