Don't use bsaes over vpaes for CTR-DRBG.

RAND_bytes is rarely called with large enough inputs for bsaes to be worth it.
https://boringssl-review.googlesource.com/c/boringssl/+/33589 includes some
rough benchmarks of the various pieces involved. Some observations:

- 8 blocks of bsaes costs roughly 6.5 blocks of vpaes. Note the comparison
  isn't quite accurate because I'm measuring bsaes_ctr32_encrypt_blocks against
  vpaes_encrypt, and vpaes in CTR mode today must make do with a C loop. Even
  assuming a cutoff of 6 rather than 7 blocks, it's rare to ask for 96 bytes
  of entropy at a time. (Rough arithmetic spelled out after this list.)

- CTR-DRBG performs some stray block operations (in ctr_drbg_update, sketched
  below), which bsaes is bad at without extra work to fold them into the CTR
  loop (not really worth it).

- CTR-DRBG calculates a couple of new key schedules on every RAND_bytes call.
  We don't currently have a constant-time bsaes key schedule. Unfortunately,
  even the plain vpaes key schedule loses to the aes_nohw one that bsaes
  currently uses, and that one isn't constant-time. Taking CTR-DRBG out of the
  bsaes equation also means one less caller to worry about when we sort out
  the rest of bsaes.

- Machines without AES hardware (clients) are not going to be RNG-bound. It's
  mostly servers pushing way too many CBC IVs that care. This means bsaes's
  current side channel tradeoffs make even less sense here.
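
To put rough numbers on the first bullet (they come from the benchmarks linked
above, so take the exact ratio with a grain of salt): one eight-block bsaes
batch costs about as much as 6.5 single-block vpaes calls, so the break-even
point sits somewhere around 6-7 blocks per request.

    1 bsaes batch (8 blocks)  ~  6.5 vpaes blocks
    6 vpaes blocks  =  6.0  <  6.5  ->  vpaes wins   (6 * 16 =  96 bytes)
    7 vpaes blocks  =  7.0  >  6.5  ->  bsaes wins   (7 * 16 = 112 bytes)

Hence the 0 /* small inputs */ hint passed from ctrdrbg.c below.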

I'm not sure yet what we should do for the rest of the bsaes mess, but it seems
clear that we want to stick with vpaes for the RNG.
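
For reference, here is roughly the shape of the code the second and third
bullets are describing. This is a simplified sketch of ctr_drbg_update, not
the actual implementation (the real function lives in ctrdrbg.c and is touched
below); the counter increment and the handling of short inputs are elided.

    // Sketch only: the update step needs just three single-block
    // encryptions (48 bytes of output), so bsaes's eight-block batching
    // has nothing to latch onto here.
    uint8_t temp[48];
    for (size_t i = 0; i < 3; i++) {
      // ... increment drbg->counter ...
      drbg->block(drbg->counter.bytes, temp + i * 16, &drbg->ks);
    }
    // XOR the caller's data into |temp|, then re-key: temp[0..31] becomes the
    // new AES key (another aes_ctr_set_key call, i.e. another key schedule)
    // and temp[32..47] the new counter.

CTR_DRBG_generate runs an update like this at least once per call, which is
where the key schedules in the third bullet come from.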

Bug: 256
Change-Id: Iec8f13af232794afd007cb1065913e8117eeee24
Reviewed-on: https://boringssl-review.googlesource.com/c/34744
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/cipher_extra/e_aesccm.c b/crypto/cipher_extra/e_aesccm.c
index 3e18659..144a909 100644
--- a/crypto/cipher_extra/e_aesccm.c
+++ b/crypto/cipher_extra/e_aesccm.c
@@ -66,7 +66,8 @@
   struct aead_aes_ccm_ctx *ccm_ctx = (struct aead_aes_ccm_ctx *)&ctx->state;
 
   block128_f block;
-  ctr128_f ctr = aes_ctr_set_key(&ccm_ctx->ks.ks, NULL, &block, key, key_len);
+  ctr128_f ctr = aes_ctr_set_key(&ccm_ctx->ks.ks, NULL, &block, key, key_len,
+                                 1 /* large inputs */);
   ctx->tag_len = tag_len;
   if (!CRYPTO_ccm128_init(&ccm_ctx->ccm, &ccm_ctx->ks.ks, block, ctr, M, L)) {
     OPENSSL_PUT_ERROR(CIPHER, ERR_R_INTERNAL_ERROR);
diff --git a/crypto/cipher_extra/e_aesctrhmac.c b/crypto/cipher_extra/e_aesctrhmac.c
index 8c45c81..0834bd1 100644
--- a/crypto/cipher_extra/e_aesctrhmac.c
+++ b/crypto/cipher_extra/e_aesctrhmac.c
@@ -94,8 +94,8 @@
     return 0;
   }
 
-  aes_ctx->ctr =
-      aes_ctr_set_key(&aes_ctx->ks.ks, NULL, &aes_ctx->block, key, aes_key_len);
+  aes_ctx->ctr = aes_ctr_set_key(&aes_ctx->ks.ks, NULL, &aes_ctx->block, key,
+                                 aes_key_len, 1 /* large inputs */);
   ctx->tag_len = tag_len;
   hmac_init(&aes_ctx->inner_init_state, &aes_ctx->outer_init_state,
             key + aes_key_len);
diff --git a/crypto/cipher_extra/e_aesgcmsiv.c b/crypto/cipher_extra/e_aesgcmsiv.c
index 71a71fa..0e5063c 100644
--- a/crypto/cipher_extra/e_aesgcmsiv.c
+++ b/crypto/cipher_extra/e_aesgcmsiv.c
@@ -595,7 +595,7 @@
   OPENSSL_memset(gcm_siv_ctx, 0, sizeof(struct aead_aes_gcm_siv_ctx));
 
   aes_ctr_set_key(&gcm_siv_ctx->ks.ks, NULL, &gcm_siv_ctx->kgk_block, key,
-                  key_len);
+                  key_len, 1 /* large inputs */);
   gcm_siv_ctx->is_256 = (key_len == 32);
   ctx->tag_len = tag_len;
 
@@ -719,7 +719,8 @@
 
   OPENSSL_memcpy(out_keys->auth_key, key_material, 16);
   aes_ctr_set_key(&out_keys->enc_key.ks, NULL, &out_keys->enc_block,
-                  key_material + 16, gcm_siv_ctx->is_256 ? 32 : 16);
+                  key_material + 16, gcm_siv_ctx->is_256 ? 32 : 16,
+                  1 /* large inputs */);
 }
 
 static int aead_aes_gcm_siv_seal_scatter(
diff --git a/crypto/fipsmodule/cipher/e_aes.c b/crypto/fipsmodule/cipher/e_aes.c
index 81c74cb..7745036 100644
--- a/crypto/fipsmodule/cipher/e_aes.c
+++ b/crypto/fipsmodule/cipher/e_aes.c
@@ -214,7 +214,7 @@
 
 ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key,
                          block128_f *out_block, const uint8_t *key,
-                         size_t key_bytes) {
+                         size_t key_bytes, int large_inputs) {
   if (hwaes_capable()) {
     aes_hw_set_encrypt_key(key, key_bytes * 8, aes_key);
     if (gcm_key != NULL) {
@@ -226,7 +226,9 @@
     return aes_hw_ctr32_encrypt_blocks;
   }
 
-  if (bsaes_capable()) {
+  const int bsaes_ok = bsaes_capable();
+  const int vpaes_ok = vpaes_capable();
+  if (bsaes_ok && (large_inputs || !vpaes_ok)) {
     AES_set_encrypt_key(key, key_bytes * 8, aes_key);
     if (gcm_key != NULL) {
       CRYPTO_gcm128_init_key(gcm_key, aes_key, AES_encrypt, 0);
@@ -237,7 +239,7 @@
     return bsaes_ctr32_encrypt_blocks;
   }
 
-  if (vpaes_capable()) {
+  if (vpaes_ok) {
     vpaes_set_encrypt_key(key, key_bytes * 8, aes_key);
     if (out_block) {
       *out_block = vpaes_encrypt;
@@ -295,7 +297,7 @@
   if (key) {
     OPENSSL_memset(&gctx->gcm, 0, sizeof(gctx->gcm));
     gctx->ctr = aes_ctr_set_key(&gctx->ks.ks, &gctx->gcm.gcm_key, NULL, key,
-                                ctx->key_len);
+                                ctx->key_len, 1 /* large inputs */);
     // If we have an iv can set it directly, otherwise use saved IV.
     if (iv == NULL && gctx->iv_set) {
       iv = gctx->iv;
@@ -838,8 +840,8 @@
     return 0;
   }
 
-  gcm_ctx->ctr =
-      aes_ctr_set_key(&gcm_ctx->ks.ks, &gcm_ctx->gcm_key, NULL, key, key_len);
+  gcm_ctx->ctr = aes_ctr_set_key(&gcm_ctx->ks.ks, &gcm_ctx->gcm_key, NULL, key,
+                                 key_len, 1 /* large inputs */);
   *out_tag_len = tag_len;
   return 1;
 }
diff --git a/crypto/fipsmodule/cipher/internal.h b/crypto/fipsmodule/cipher/internal.h
index 7c739fb..b9e61ec 100644
--- a/crypto/fipsmodule/cipher/internal.h
+++ b/crypto/fipsmodule/cipher/internal.h
@@ -116,11 +116,12 @@
 // where |key_bytes| must either be 16, 24 or 32. If not NULL, |*out_block| is
 // set to a function that encrypts single blocks. If not NULL, |*gcm_key| is
 // initialised to do GHASH with the given key. It returns a function for
-// optimised CTR-mode, or NULL if CTR-mode should be built using
-// |*out_block|.
+// optimised CTR-mode, or NULL if CTR-mode should be built using |*out_block|.
+// |large_inputs| is a hint to select AES implementations. If it is one, the
+// caller expects this key to be used with large inputs.
 ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key,
                          block128_f *out_block, const uint8_t *key,
-                         size_t key_bytes);
+                         size_t key_bytes, int large_inputs);
 
 #if defined(__cplusplus)
 }  // extern C
diff --git a/crypto/fipsmodule/rand/ctrdrbg.c b/crypto/fipsmodule/rand/ctrdrbg.c
index b2fda1d..418f56b 100644
--- a/crypto/fipsmodule/rand/ctrdrbg.c
+++ b/crypto/fipsmodule/rand/ctrdrbg.c
@@ -57,7 +57,12 @@
     seed_material[i] ^= kInitMask[i];
   }
 
-  drbg->ctr = aes_ctr_set_key(&drbg->ks, NULL, &drbg->block, seed_material, 32);
+  // |RAND_bytes| is rarely called with large enough inputs for bsaes to be
+  // faster than vpaes. bsaes also currently has side channel tradeoffs
+  // (https://crbug.com/boringssl/256), which we should especially avoid in the
+  // PRNG. (Note the size hint is a no-op on machines with AES instructions.)
+  drbg->ctr = aes_ctr_set_key(&drbg->ks, NULL, &drbg->block, seed_material, 32,
+                              0 /* small inputs */);
   OPENSSL_memcpy(drbg->counter.bytes, seed_material + 32, 16);
   drbg->reseed_counter = 1;
 
@@ -93,7 +98,8 @@
     temp[i] ^= data[i];
   }
 
-  drbg->ctr = aes_ctr_set_key(&drbg->ks, NULL, &drbg->block, temp, 32);
+  drbg->ctr = aes_ctr_set_key(&drbg->ks, NULL, &drbg->block, temp, 32,
+                              0 /* small inputs */);
   OPENSSL_memcpy(drbg->counter.bytes, temp + 32, 16);
 
   return 1;