Use a mix of bsaes and vpaes for CTR on NEON.

tl;dr: AES is now constant-time on 32-bit ARM with NEON. Combined with
all the past work, we now have constant-time AES and GHASH on ARM and
x86 chips, 32-bit and 64-bit, provided NEON (required by Chrome on
Android, aside from https://crbug.com/341598) or SSSE3 (almost all
Chrome on Windows users) is available!

The CTR-like bsaes modes are harder to resolve than CBC decryption:
they use both bulk (ctr128_f) and one-off (block128_f) operations. We
currently use bsaes for ctr128_f and aes_nohw (not constant-time) for
block128_f, which hits 22.0 MB/s on my test chip.

Implement a vpaes/bsaes hybrid to get the best of both worlds. The key
is kept in vpaes form and, when the input is large enough to amortize
the conversion, we convert it to bsaes form on demand. This retains
bsaes's bulk performance, but with no variable-time gaps.
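
In code, the hybrid dispatch condenses to the following (a trimmed-down
view of vpaes_ctr32_encrypt_blocks_with_bsaes from the e_aes.c hunk
below; the real version also rounds off partial final batches):

  static void vpaes_ctr32_encrypt_blocks_with_bsaes(
      const uint8_t *in, uint8_t *out, size_t blocks, const AES_KEY *key,
      const uint8_t ivec[16]) {
    if (blocks < 8) {
      // Too small to amortize the key conversion; stay in vpaes.
      vpaes_ctr32_encrypt_blocks(in, out, blocks, key, ivec);
      return;
    }
    AES_KEY bsaes;
    vpaes_encrypt_key_to_bsaes(&bsaes, key);  // constant-time conversion
    bsaes_ctr32_encrypt_blocks(in, out, blocks, &bsaes, ivec);
    OPENSSL_cleanse(&bsaes, sizeof(bsaes));
  }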

Alternatives considered:

- Convert to bsaes form immediately and only use bsaes. This makes the
  one-off block128_f calls very expensive: one 8-block batch of
  bsaes_ctr32_encrypt_blocks costs as much as 5.76 vpaes_encrypt calls.
  (See the crossover arithmetic after this list.)

- Do the above, but fold the one-off calls into bsaes batches because
  GCM is parallelizable. This is a mess with the current internal
  structure and doesn't apply to, e.g., CCM.

- Drop bsaes in favor of vpaes. However, even with
  vpaes_ctr32_encrypt_blocks, vpaes only reaches 15.5 MB/s. The hybrid
  is a 40% win on an important platform.

- Try to narrow the gap with a "2x" interleaving optimization, as we did
  for x86_64. I attempted this here, but the register pressure was
  tricky. (x86_64 was already tight, and NEON cannot address memory in
  vtbl.) Even ignoring register pressure (which gives the wrong answer,
  but bounds the potential win), the gap was still 20-25%. Performance
  here is slower overall (20 MB/s for old ARM vs. 120-140 MB/s for old
  x86_64), so that gap is scarier.
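
To make the crossover in the first alternative concrete: the tail after
the last full batch is b = blocks % 8 blocks. Handing it to bsaes still
costs one full 8-block batch, about 5.76 vpaes_encrypt calls, while
looping vpaes_encrypt costs exactly b calls:

  bsaes tail cost ~ 5.76 vpaes_encrypt calls (always a full 8-block batch)
  vpaes tail cost =  b   vpaes_encrypt calls
  => bsaes wins only when b >= 6, hence the |bsaes_blocks % 8 < 6|
     rounding in e_aes.c below.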

I retained vpaes_ctr32_encrypt_blocks because it's fairly compact (only
84 bytes assembled), though it's less important in the bsaes hybrid.

Cortex-A53 (Raspberry Pi 3 Model B+)
Before:
Did 267000 AES-128-GCM (16 bytes) seal operations in 2004871us (133175.7 ops/sec): 2.1 MB/s
Did 135000 AES-128-GCM (256 bytes) seal operations in 2013825us (67036.6 ops/sec): 17.2 MB/s
Did 31000 AES-128-GCM (1350 bytes) seal operations in 2059039us (15055.6 ops/sec): 20.3 MB/s
Did 5565 AES-128-GCM (8192 bytes) seal operations in 2073607us (2683.7 ops/sec): 22.0 MB/s
Did 2709 AES-128-GCM (16384 bytes) seal operations in 2020264us (1340.9 ops/sec): 22.0 MB/s
Did 209000 AES-256-GCM (16 bytes) seal operations in 2005654us (104205.4 ops/sec): 1.7 MB/s
Did 109000 AES-256-GCM (256 bytes) seal operations in 2011293us (54194.0 ops/sec): 13.9 MB/s
Did 25000 AES-256-GCM (1350 bytes) seal operations in 2082385us (12005.5 ops/sec): 16.2 MB/s
Did 4452 AES-256-GCM (8192 bytes) seal operations in 2080729us (2139.6 ops/sec): 17.5 MB/s
Did 2226 AES-256-GCM (16384 bytes) seal operations in 2079819us (1070.3 ops/sec): 17.5 MB/s

After:
Did 542000 AES-128-GCM (16 bytes) seal operations in 2003408us (270539.0 ops/sec): 4.3 MB/s [+104.8%]
Did 124000 AES-128-GCM (256 bytes) seal operations in 2012579us (61612.5 ops/sec): 15.8 MB/s [-8.1%]
Did 30000 AES-128-GCM (1350 bytes) seal operations in 2020636us (14846.8 ops/sec): 20.0 MB/s [-1.5%]
Did 5502 AES-128-GCM (8192 bytes) seal operations in 2068807us (2659.5 ops/sec): 21.8 MB/s [-0.9%]
Did 2772 AES-128-GCM (16384 bytes) seal operations in 2085176us (1329.4 ops/sec): 21.8 MB/s [-0.9%]
Did 459000 AES-256-GCM (16 bytes) seal operations in 2003587us (229089.1 ops/sec): 3.7 MB/s [+117.6%]
Did 100000 AES-256-GCM (256 bytes) seal operations in 2018311us (49546.4 ops/sec): 12.7 MB/s [-8.6%]
Did 24000 AES-256-GCM (1350 bytes) seal operations in 2026975us (11840.3 ops/sec): 16.0 MB/s [-1.2%]
Did 4410 AES-256-GCM (8192 bytes) seal operations in 2079581us (2120.6 ops/sec): 17.4 MB/s [-0.6%]
Did 2226 AES-256-GCM (16384 bytes) seal operations in 2099318us (1060.3 ops/sec): 17.4 MB/s [-0.6%]

Bug: 256
Change-Id: Ib74ab7e63974d3ddae8ce5fc35c9b44e73dce305
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/37429
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
index d147c96..f9fadda 100644
--- a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
+++ b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
@@ -1281,6 +1281,65 @@
 ___
 }
 
+{
+# Register-passed parameters.
+my ($inp, $out, $len, $key) = map("r$_", 0..3);
+# Temporaries. _vpaes_encrypt_core already uses r8..r11, so overlap $ivec and
+# $tmp. $ctr is r7 because it must be preserved across calls.
+my ($ctr, $ivec, $tmp) = map("r$_", 7..9);
+
+# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
+#                                 const AES_KEY *key, const uint8_t ivec[16]);
+$code .= <<___;
+.globl	vpaes_ctr32_encrypt_blocks
+.type	vpaes_ctr32_encrypt_blocks,%function
+.align	4
+vpaes_ctr32_encrypt_blocks:
+	mov	ip, sp
+	stmdb	sp!, {r7-r11, lr}
+	@ This function uses q4-q7 (d8-d15), which are callee-saved.
+	vstmdb	sp!, {d8-d15}
+
+	cmp	$len, #0
+	@ $ivec is passed on the stack.
+	ldr	$ivec, [ip]
+	beq	.Lctr32_done
+
+	@ _vpaes_encrypt_core expects the key in r2, so swap $len and $key.
+	mov	$tmp, $key
+	mov	$key, $len
+	mov	$len, $tmp
+___
+my ($len, $key) = ($key, $len);
+$code .= <<___;
+
+	@ Load the IV and counter portion.
+	ldr	$ctr, [$ivec, #12]
+	vld1.8	{q7}, [$ivec]
+
+	bl	_vpaes_preheat
+	rev	$ctr, $ctr		@ The counter is big-endian.
+
+.Lctr32_loop:
+	vmov	q0, q7
+	vld1.8	{q6}, [$inp]!		@ Load input ahead of time
+	bl	_vpaes_encrypt_core
+	veor	q0, q0, q6		@ XOR input and result
+	vst1.8	{q0}, [$out]!
+	subs	$len, $len, #1
+	@ Update the counter.
+	add	$ctr, $ctr, #1
+	rev	$tmp, $ctr
+	vmov.32	q7#hi[1], $tmp
+	bne	.Lctr32_loop
+
+.Lctr32_done:
+	vldmia	sp!, {d8-d15}
+	ldmia	sp!, {r7-r11, pc}	@ return
+.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+___
+}
+
 foreach (split("\n",$code)) {
 	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
 	print $_,"\n";
diff --git a/crypto/fipsmodule/aes/internal.h b/crypto/fipsmodule/aes/internal.h
index 8471a80..99d509a 100644
--- a/crypto/fipsmodule/aes/internal.h
+++ b/crypto/fipsmodule/aes/internal.h
@@ -51,6 +51,7 @@
 #if defined(OPENSSL_ARM)
 #define BSAES
 #define VPAES
+#define VPAES_CTR32
 OPENSSL_INLINE int bsaes_capable(void) { return CRYPTO_is_NEON_capable(); }
 OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_NEON_capable(); }
 #endif
diff --git a/crypto/fipsmodule/cipher/e_aes.c b/crypto/fipsmodule/cipher/e_aes.c
index 685d2db..72910ed 100644
--- a/crypto/fipsmodule/cipher/e_aes.c
+++ b/crypto/fipsmodule/cipher/e_aes.c
@@ -68,6 +68,50 @@
 OPENSSL_MSVC_PRAGMA(warning(push))
 OPENSSL_MSVC_PRAGMA(warning(disable: 4702))  // Unreachable code.
 
+#if defined(BSAES)
+static void vpaes_ctr32_encrypt_blocks_with_bsaes(const uint8_t *in,
+                                                  uint8_t *out, size_t blocks,
+                                                  const AES_KEY *key,
+                                                  const uint8_t ivec[16]) {
+  // |bsaes_ctr32_encrypt_blocks| is faster than |vpaes_ctr32_encrypt_blocks|,
+  // but it takes at least one full 8-block batch to amortize the conversion.
+  if (blocks < 8) {
+    vpaes_ctr32_encrypt_blocks(in, out, blocks, key, ivec);
+    return;
+  }
+
+  size_t bsaes_blocks = blocks;
+  if (bsaes_blocks % 8 < 6) {
+    // |bsaes_ctr32_encrypt_blocks| internally works in 8-block batches. If the
+    // final batch is too small (under six blocks), it is faster to loop over
+    // |vpaes_encrypt|. Round |bsaes_blocks| down to a multiple of 8.
+    bsaes_blocks -= bsaes_blocks % 8;
+  }
+
+  AES_KEY bsaes;
+  vpaes_encrypt_key_to_bsaes(&bsaes, key);
+  bsaes_ctr32_encrypt_blocks(in, out, bsaes_blocks, &bsaes, ivec);
+  OPENSSL_cleanse(&bsaes, sizeof(bsaes));
+
+  in += 16 * bsaes_blocks;
+  out += 16 * bsaes_blocks;
+  blocks -= bsaes_blocks;
+
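+  // Reconstruct the IV for the remaining blocks: the 32-bit counter is
+  // big-endian, so byte-swap, add the blocks bsaes consumed, and swap back.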
+  union {
+    uint32_t u32[4];
+    uint8_t u8[16];
+  } new_ivec;
+  memcpy(new_ivec.u8, ivec, 16);
+  uint32_t ctr = CRYPTO_bswap4(new_ivec.u32[3]) + bsaes_blocks;
+  new_ivec.u32[3] = CRYPTO_bswap4(ctr);
+
+  // Finish any remaining blocks with |vpaes_ctr32_encrypt_blocks|.
+  vpaes_ctr32_encrypt_blocks(in, out, blocks, key, new_ivec.u8);
+}
+#endif  // BSAES
+
 typedef struct {
   union {
     double align;
@@ -110,6 +152,7 @@
         dat->stream.cbc = aes_hw_cbc_encrypt;
       }
     } else if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) {
+      assert(vpaes_capable());
       ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
       if (ret == 0) {
         vpaes_decrypt_key_to_bsaes(&dat->ks.ks, &dat->ks.ks);
@@ -145,11 +188,6 @@
     } else if (mode == EVP_CIPH_CTR_MODE) {
       dat->stream.ctr = aes_hw_ctr32_encrypt_blocks;
     }
-  } else if (bsaes_capable() && mode == EVP_CIPH_CTR_MODE) {
-    ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
-    // If |dat->stream.ctr| is provided, |dat->block| is never used.
-    dat->block = NULL;
-    dat->stream.ctr = bsaes_ctr32_encrypt_blocks;
   } else if (vpaes_capable()) {
     ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
     dat->block = vpaes_encrypt;
@@ -159,11 +197,14 @@
       dat->stream.cbc = vpaes_cbc_encrypt;
     }
 #endif
-#if defined(VPAES_CTR32)
     if (mode == EVP_CIPH_CTR_MODE) {
+#if defined(BSAES)
+      assert(bsaes_capable());
+      dat->stream.ctr = vpaes_ctr32_encrypt_blocks_with_bsaes;
+#elif defined(VPAES_CTR32)
       dat->stream.ctr = vpaes_ctr32_encrypt_blocks;
-    }
 #endif
+    }
   } else {
     ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
     dat->block = aes_nohw_encrypt;
@@ -252,17 +293,6 @@
     return aes_hw_ctr32_encrypt_blocks;
   }
 
-  if (bsaes_capable()) {
-    aes_nohw_set_encrypt_key(key, key_bytes * 8, aes_key);
-    if (gcm_key != NULL) {
-      CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_nohw_encrypt, 0);
-    }
-    if (out_block) {
-      *out_block = aes_nohw_encrypt;
-    }
-    return bsaes_ctr32_encrypt_blocks;
-  }
-
   if (vpaes_capable()) {
     vpaes_set_encrypt_key(key, key_bytes * 8, aes_key);
     if (out_block) {
@@ -271,7 +301,10 @@
     if (gcm_key != NULL) {
       CRYPTO_gcm128_init_key(gcm_key, aes_key, vpaes_encrypt, 0);
     }
-#if defined(VPAES_CTR32)
+#if defined(BSAES)
+    assert(bsaes_capable());
+    return vpaes_ctr32_encrypt_blocks_with_bsaes;
+#elif defined(VPAES_CTR32)
     return vpaes_ctr32_encrypt_blocks;
 #else
     return NULL;