Add NEON vpaes-to-bsaes key converters.

This was translated from
https://boringssl-review.googlesource.com/c/boringssl/+/33588

vpaes is disappointing on NEON, but bsaes has no constant-time key
schedule functions of its own. Implement functions that convert a vpaes
key schedule to the bsaes representation, so bsaes can be paired with a
constant-time key setup.

Bug: 256
Change-Id: Icf5fd6a9a948b8fb18f7a0cdd60a1c4d57bb9332
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/37427
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/aes/aes_test.cc b/crypto/fipsmodule/aes/aes_test.cc
index f83c105..5061e01 100644
--- a/crypto/fipsmodule/aes/aes_test.cc
+++ b/crypto/fipsmodule/aes/aes_test.cc
@@ -22,6 +22,7 @@
 #include <gtest/gtest.h>
 
 #include <openssl/aes.h>
+#include <openssl/rand.h>
 
 #include "internal.h"
 #include "../../internal.h"
@@ -304,7 +305,8 @@
 #endif
 
     if (bsaes_capable()) {
-      aes_nohw_set_encrypt_key(kKey, bits, &key);
+      vpaes_set_encrypt_key(kKey, bits, &key);
+      CHECK_ABI(vpaes_encrypt_key_to_bsaes, &key, &key);
       for (size_t blocks : block_counts) {
         SCOPED_TRACE(blocks);
         if (blocks != 0) {
@@ -312,7 +314,8 @@
         }
       }
 
-      aes_nohw_set_decrypt_key(kKey, bits, &key);
+      vpaes_set_decrypt_key(kKey, bits, &key);
+      CHECK_ABI(vpaes_decrypt_key_to_bsaes, &key, &key);
       for (size_t blocks : block_counts) {
         SCOPED_TRACE(blocks);
         CHECK_ABI(bsaes_cbc_encrypt, buf, buf, AES_BLOCK_SIZE * blocks, &key,
@@ -374,3 +377,52 @@
   }
 }
 #endif  // SUPPORTS_ABI_TEST
+
+#if defined(BSAES) && !defined(BORINGSSL_SHARED_LIBRARY)
+static Bytes AESKeyToBytes(const AES_KEY *key) {
+  return Bytes(reinterpret_cast<const uint8_t *>(key), sizeof(*key));
+}
+
+TEST(AESTest, VPAESToBSAESConvert) {
+  const int kNumIterations = 1000;
+  for (int i = 0; i < kNumIterations; i++) {
+    uint8_t key[256 / 8];
+    RAND_bytes(key, sizeof(key));
+    SCOPED_TRACE(Bytes(key));
+    for (unsigned bits : {128u, 192u, 256u}) {
+      SCOPED_TRACE(bits);
+      for (bool enc : {false, true}) {
+        SCOPED_TRACE(enc);
+        AES_KEY nohw, vpaes, bsaes;
+        OPENSSL_memset(&nohw, 0xaa, sizeof(nohw));
+        OPENSSL_memset(&vpaes, 0xaa, sizeof(vpaes));
+        OPENSSL_memset(&bsaes, 0xaa, sizeof(bsaes));
+
+        if (enc) {
+          aes_nohw_set_encrypt_key(key, bits, &nohw);
+          vpaes_set_encrypt_key(key, bits, &vpaes);
+          vpaes_encrypt_key_to_bsaes(&bsaes, &vpaes);
+        } else {
+          aes_nohw_set_decrypt_key(key, bits, &nohw);
+          vpaes_set_decrypt_key(key, bits, &vpaes);
+          vpaes_decrypt_key_to_bsaes(&bsaes, &vpaes);
+        }
+
+        // This failure needn't be fatal, but assert so that one mismatch
+        // stops the loop rather than spamming the user's console.
+        ASSERT_EQ(AESKeyToBytes(&nohw), AESKeyToBytes(&bsaes));
+
+        // Repeat the test in-place.
+        OPENSSL_memcpy(&bsaes, &vpaes, sizeof(AES_KEY));
+        if (enc) {
+          vpaes_encrypt_key_to_bsaes(&bsaes, &bsaes);
+        } else {
+          vpaes_decrypt_key_to_bsaes(&bsaes, &bsaes);
+        }
+
+        ASSERT_EQ(AESKeyToBytes(&nohw), AESKeyToBytes(&bsaes));
+      }
+    }
+  }
+}
+#endif  // BSAES && !BORINGSSL_SHARED_LIBRARY
diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
index a756321..d147c96 100644
--- a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
+++ b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
@@ -198,6 +198,50 @@
 	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
 	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
 
+
+@ Additional constants for converting to bsaes.
+
+@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
+@ transform in the AES S-box. 0x63 is incorporated into the low half of the
+@ table. This was computed with the following script:
+@
+@   def u64s_to_u128(x, y):
+@       return x | (y << 64)
+@   def u128_to_u64s(w):
+@       return w & ((1<<64)-1), w >> 64
+@   def get_byte(w, i):
+@       return (w >> (i*8)) & 0xff
+@   def apply_table(table, b):
+@       lo = b & 0xf
+@       hi = b >> 4
+@       return get_byte(table[0], lo) ^ get_byte(table[1], hi)
+@   def opt(b):
+@       table = [
+@           u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
+@           u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
+@       ]
+@       return apply_table(table, b)
+@   def rot_byte(b, n):
+@       return 0xff & ((b << n) | (b >> (8-n)))
+@   def skew(x):
+@       return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
+@               rot_byte(x, 4))
+@   table = [0, 0]
+@   for i in range(16):
+@       table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
+@       table[1] |= skew(opt(i<<4)) << (i*8)
+@   print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0]))
+@   print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1]))
+.Lk_opt_then_skew:
+	.quad	0x9cb8436798bc4763, 0x6440bb9f6044bf9b
+	.quad	0x1f30062936192f00, 0xb49bad829db284ab
+
+@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
+@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
+@ becomes 0x22334411 and then 0x11443322.
+.Lk_decrypt_transform:
+	.quad	0x0704050603000102, 0x0f0c0d0e0b08090a
+
 .asciz  "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)"
 .size	_vpaes_consts,.-_vpaes_consts
 .align	6
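The rotate-then-byte-swap claim for .Lk_decrypt_transform is easy to check on
the host. The following standalone C sketch is illustrative and not part of
the patch; it assumes a little-endian GCC or Clang host (for
__builtin_bswap32) and models NEON's VTBL byte permute. The same harness
extends to .Lk_opt_then_skew; note the nibble split in apply_table above is
valid because opt and skew are GF(2)-linear, with the 0x63 constant folded
into the low half of the table.

    /* Check that .Lk_decrypt_transform rotates each 32-bit word left by
     * 8 bits and then byte-swaps it, using a model of VTBL. */
    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* vtbl.8: out[i] = in[table[i]] (all indices here are in range). */
    static void vtbl(uint8_t out[16], const uint8_t in[16],
                     const uint8_t table[16]) {
      for (int i = 0; i < 16; i++) {
        out[i] = in[table[i]];
      }
    }

    int main(void) {
      /* .Lk_decrypt_transform as bytes (little-endian .quads). */
      static const uint8_t kTable[16] = {0x02, 0x01, 0x00, 0x03, 0x06, 0x05,
                                         0x04, 0x07, 0x0a, 0x09, 0x08, 0x0b,
                                         0x0e, 0x0d, 0x0c, 0x0f};
      uint8_t in[16], want[16], got[16];
      for (int i = 0; i < 16; i++) {
        in[i] = (uint8_t)(17 * i + 1);  /* Arbitrary test input. */
      }
      for (int w = 0; w < 4; w++) {
        uint32_t x;
        memcpy(&x, &in[4 * w], 4);  /* Little-endian host assumed. */
        x = (x << 8) | (x >> 24);   /* Rotate left by 8 bits... */
        x = __builtin_bswap32(x);   /* ...then byte-swap. */
        memcpy(&want[4 * w], &x, 4);
      }
      vtbl(got, in, kTable);
      assert(memcmp(got, want, 16) == 0);
      return 0;
    }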
@@ -1047,6 +1091,196 @@
 ___
 }
 
+{
+my ($out, $inp) = map("r$_", (0..1));
+my ($s0F, $s63, $s63_raw, $mc_forward) = map("q$_", (9..12));
+
+$code .= <<___;
+@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
+.globl	vpaes_encrypt_key_to_bsaes
+.type	vpaes_encrypt_key_to_bsaes,%function
+.align	4
+vpaes_encrypt_key_to_bsaes:
+	stmdb	sp!, {r11, lr}
+
+	@ See _vpaes_schedule_core for the key schedule logic. In particular,
+	@ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
+	@ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
+	@ contain the transformations not in the bsaes representation. This
+	@ function inverts those transforms.
+	@
+	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+	@ representation, which does not match the other aes_nohw_*
+	@ implementations. The ARM aes_nohw_* stores each 32-bit word
+	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+	@ cost of extra REV and VREV32 operations in little-endian ARM.
+
+	adr	r2, .Lk_mc_forward
+	adr	r3, .Lk_sr+0x10
+	adr	r11, .Lk_opt		@ Input to _vpaes_schedule_transform.
+	vld1.64	{$mc_forward}, [r2]
+	vmov.i8	$s0F, #0x0f		@ Required by _vpaes_schedule_transform
+	vmov.i8	$s63, #0x5b		@ .Lk_s63 from vpaes-x86_64
+	vmov.i8	$s63_raw, #0x63		@ .Lk_s63 without .Lk_ipt applied
+
+	@ vpaes stores a round count one less than bsaes, but the number of
+	@ round keys is the same.
+	ldr	r2, [$inp,#240]
+	add	r2, r2, #1
+	str	r2, [$out,#240]
+
+	@ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
+	@ Invert this with .Lk_opt.
+	vld1.64	{q0}, [$inp]!
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [$out]!
+
+	@ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
+	@ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
+	@ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
+.Loop_enc_key_to_bsaes:
+	vld1.64	{q0}, [$inp]!
+
+	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
+	@ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
+	@ We use r3 rather than r8 to avoid a callee-saved register.
+	vld1.64	{q1}, [r3]
+	vtbl.8  q2#lo, {q0}, q1#lo
+	vtbl.8  q2#hi, {q0}, q1#hi
+	add	r3, r3, #16
+	and	r3, r3, #~(1<<6)
+	vmov	q0, q2
+
+	@ Handle the last key differently.
+	subs	r2, r2, #1
+	beq	.Loop_enc_key_to_bsaes_last
+
+	@ Multiply by the circulant. This is its own inverse.
+	vtbl.8	q1#lo, {q0}, $mc_forward#lo
+	vtbl.8	q1#hi, {q0}, $mc_forward#hi
+	vmov	q0, q1
+	vtbl.8	q2#lo, {q1}, $mc_forward#lo
+	vtbl.8	q2#hi, {q1}, $mc_forward#hi
+	veor	q0, q0, q2
+	vtbl.8	q1#lo, {q2}, $mc_forward#lo
+	vtbl.8	q1#hi, {q2}, $mc_forward#hi
+	veor	q0, q0, q1
+
+	@ XOR and finish.
+	veor	q0, q0, $s63
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [$out]!
+	b	.Loop_enc_key_to_bsaes
+
+.Loop_enc_key_to_bsaes_last:
+	@ The final key does not have a basis transform (note
+	@ .Lschedule_mangle_last inverts the original transform). It only XORs
+	@ 0x63 and applies ShiftRows. The latter was already inverted in the
+	@ loop. Note that, because we act on the original representation, we use
+	@ $s63_raw, not $s63.
+	veor	q0, q0, $s63_raw
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [$out]
+
+	@ Wipe registers which contained key material.
+	veor	q0, q0, q0
+	veor	q1, q1, q1
+	veor	q2, q2, q2
+
+	ldmia	sp!, {r11, pc}	@ return
+.size	vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes
+
+@ void vpaes_decrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
+.globl	vpaes_decrypt_key_to_bsaes
+.type	vpaes_decrypt_key_to_bsaes,%function
+.align	4
+vpaes_decrypt_key_to_bsaes:
+	stmdb	sp!, {r11, lr}
+
+	@ See _vpaes_schedule_core for the key schedule logic. Note vpaes
+	@ computes the decryption key schedule in reverse. Additionally,
+	@ aes-armv4.pl shares some transformations, so we must only partially
+	@ invert vpaes's transformations. In general, vpaes computes in a
+	@ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
+	@ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
+	@ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
+	@
+	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+	@ representation, which does not match the other aes_nohw_*
+	@ implementations. The ARM aes_nohw_* stores each 32-bit word
+	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+	@ cost of extra REV and VREV32 operations in little-endian ARM.
+
+	adr	r2, .Lk_decrypt_transform
+	adr	r3, .Lk_sr+0x30
+	adr	r11, .Lk_opt_then_skew	@ Input to _vpaes_schedule_transform.
+	vld1.64	{$mc_forward}, [r2]	@ Reuse $mc_forward from encryption.
+	vmov.i8	$s0F, #0x0f		@ Required by _vpaes_schedule_transform
+
+	@ vpaes stores a round count one less than bsaes, but the number of
+	@ round keys is the same.
+	ldr	r2, [$inp,#240]
+	add	r2, r2, #1
+	str	r2, [$out,#240]
+
+	@ Undo the basis change and reapply the S-box affine transform. See
+	@ .Lschedule_mangle_last.
+	vld1.64	{q0}, [$inp]!
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [$out]!
+
+	@ See _vpaes_schedule_mangle for the transform on the middle keys. Note
+	@ it simultaneously inverts MixColumns and the S-box affine transform.
+	@ See .Lk_dksd through .Lk_dks9.
+.Loop_dec_key_to_bsaes:
+	vld1.64	{q0}, [$inp]!
+
+	@ Invert the ShiftRows step (see .Lschedule_mangle_both). The schedule
+	@ was built in reverse, so we cycle r3 as in key generation rather than
+	@ inverting it. We use r3 rather than r8 to avoid a callee-saved register.
+	vld1.64	{q1}, [r3]
+	vtbl.8  q2#lo, {q0}, q1#lo
+	vtbl.8  q2#hi, {q0}, q1#hi
+	add	r3, r3, #64-16
+	and	r3, r3, #~(1<<6)
+	vmov	q0, q2
+
+	@ Handle the last key differently.
+	subs	r2, r2, #1
+	beq	.Loop_dec_key_to_bsaes_last
+
+	@ Undo the basis change and reapply the S-box affine transform.
+	bl	_vpaes_schedule_transform
+
+	@ Rotate each word by 8 bits (cycle the rows) and then byte-swap. We
+	@ combine the two operations in .Lk_decrypt_transform.
+	@
+	@ TODO(davidben): Where does the rotation come from?
+	vtbl.8	q1#lo, {q0}, $mc_forward#lo
+	vtbl.8	q1#hi, {q0}, $mc_forward#hi
+
+	vst1.64	{q1}, [$out]!
+	b	.Loop_dec_key_to_bsaes
+
+.Loop_dec_key_to_bsaes_last:
+	@ The final key only inverts ShiftRows (already done in the loop). See
+	@ .Lschedule_am_decrypting. Its basis is not transformed.
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [$out]!
+
+	@ Wipe registers which contained key material.
+	veor	q0, q0, q0
+	veor	q1, q1, q1
+	veor	q2, q2, q2
+
+	ldmia	sp!, {r11, pc}	@ return
+.size	vpaes_decrypt_key_to_bsaes,.-vpaes_decrypt_key_to_bsaes
+___
+}
+
 foreach (split("\n",$code)) {
 	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
 	print $_,"\n";
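One step above deserves spelling out: vpaes_encrypt_key_to_bsaes undoes
_vpaes_schedule_mangle's multiplication by the circulant 0,1,1,1 by simply
applying it again. With r denoting the byte rotation .Lk_mc_forward performs
within each 32-bit word, the mangle computes M = r + r^2 + r^3 over GF(2),
and M^2 = r^2 + r^4 + r^6 = r^2 + 1 + r^2 = 1, since r^4 = 1 and the cross
terms cancel in pairs mod 2. A standalone C sketch of the argument follows;
it is illustrative only, and the rotation direction does not affect it.

    #include <assert.h>
    #include <stdint.h>

    /* r: rotate the four bytes of a word by one position. */
    static uint32_t rot(uint32_t x) { return (x << 8) | (x >> 24); }

    /* M: multiply by the circulant 0,1,1,1, i.e. r(x) ^ r^2(x) ^ r^3(x).
     * This mirrors the three VTBL/VEOR pairs in the loop above. */
    static uint32_t circulant(uint32_t x) {
      uint32_t r1 = rot(x), r2 = rot(r1), r3 = rot(r2);
      return r1 ^ r2 ^ r3;
    }

    int main(void) {
      for (uint32_t x = 0; x < 0x10000; x++) {  /* Spot-check many words. */
        assert(circulant(circulant(x)) == x);   /* M is an involution. */
      }
      return 0;
    }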
diff --git a/crypto/fipsmodule/aes/internal.h b/crypto/fipsmodule/aes/internal.h
index 5428b54..8471a80 100644
--- a/crypto/fipsmodule/aes/internal.h
+++ b/crypto/fipsmodule/aes/internal.h
@@ -134,12 +134,14 @@
 
 
 #if defined(BSAES)
-// On platforms where BSAES gets defined (just above), then these functions are
-// provided by asm. Note |bsaes_cbc_encrypt| requires |enc| to be zero.
+// Note |bsaes_cbc_encrypt| requires |enc| to be zero.
 void bsaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
                        const AES_KEY *key, uint8_t ivec[16], int enc);
 void bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
                                 const AES_KEY *key, const uint8_t ivec[16]);
+// VPAES to BSAES conversions are available on all BSAES platforms.
+void vpaes_encrypt_key_to_bsaes(AES_KEY *out_bsaes, const AES_KEY *vpaes);
+void vpaes_decrypt_key_to_bsaes(AES_KEY *out_bsaes, const AES_KEY *vpaes);
 #else
 OPENSSL_INLINE char bsaes_capable(void) { return 0; }
 
@@ -156,6 +158,16 @@
                                                const uint8_t ivec[16]) {
   abort();
 }
+
+OPENSSL_INLINE void vpaes_encrypt_key_to_bsaes(AES_KEY *out_bsaes,
+                                               const AES_KEY *vpaes) {
+  abort();
+}
+
+OPENSSL_INLINE void vpaes_decrypt_key_to_bsaes(AES_KEY *out_bsaes,
+                                               const AES_KEY *vpaes) {
+  abort();
+}
 #endif  // !BSAES
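
For context, the intended use inside the module pairs vpaes key setup with
bsaes bulk operations. A rough sketch, assuming the existing
vpaes_set_encrypt_key declaration elsewhere in this header; the helper name
is invented for illustration.

    #include <openssl/aes.h>

    #include "internal.h"

    /* Hypothetical helper: constant-time key setup for bsaes CTR mode. */
    static int set_bsaes_ctr_key(const uint8_t *key, unsigned bits,
                                 AES_KEY *out) {
      /* vpaes_set_encrypt_key returns zero on success, matching
       * AES_set_encrypt_key. */
      if (vpaes_set_encrypt_key(key, bits, out) != 0) {
        return 0;
      }
      /* In-place conversion is supported; the new test exercises it. */
      vpaes_encrypt_key_to_bsaes(out, out);
      return 1;  /* |out| now has the form bsaes_ctr32_encrypt_blocks expects. */
    }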