Patch out the aes_nohw fallback in bsaes_cbc_encrypt.

This plugs all bsaes fallback leaks for CBC outside of the key schedule.
The CBC EVP_CIPHERs never call the block function directly when there's
a stream.cbc function available.

This affects CBC decryptions of length < 128 or 16 mod 128.
Performance-wise, we don't really care about CBC apart from passing
glances at its use in TLS. There, the Lucky13 workaround mutes the
effects.

Cortex-A53 (Raspberry Pi 3 Model B+)
Before:
Did 78000 AES-128-CBC-SHA1 (16 bytes) open operations in 3020254us (25825.6 ops/sec): 0.4 MB/s
Did 75000 AES-128-CBC-SHA1 (32 bytes) open operations in 3005760us (24952.1 ops/sec): 0.8 MB/s
Did 71000 AES-128-CBC-SHA1 (64 bytes) open operations in 3038137us (23369.6 ops/sec): 1.5 MB/s
Did 67000 AES-128-CBC-SHA1 (96 bytes) open operations in 3027686us (22129.1 ops/sec): 2.1 MB/s
Did 64000 AES-128-CBC-SHA1 (112 bytes) open operations in 3005491us (21294.4 ops/sec): 2.4 MB/s
Did 59000 AES-128-CBC-SHA1 (128 bytes) open operations in 3020083us (19535.9 ops/sec): 2.5 MB/s
Did 53000 AES-128-CBC-SHA1 (240 bytes) open operations in 3020105us (17549.1 ops/sec): 4.2 MB/s
After:
Did 71668 AES-128-CBC-SHA1 (16 bytes) open operations in 3020896us (23724.1 ops/sec): 0.4 MB/s
Did 71000 AES-128-CBC-SHA1 (32 bytes) open operations in 3040826us (23348.9 ops/sec): 0.7 MB/s
Did 68000 AES-128-CBC-SHA1 (64 bytes) open operations in 3009913us (22592.0 ops/sec): 1.4 MB/s
Did 66000 AES-128-CBC-SHA1 (96 bytes) open operations in 3007597us (21944.4 ops/sec): 2.1 MB/s
Did 59000 AES-128-CBC-SHA1 (112 bytes) open operations in 3002878us (19647.8 ops/sec): 2.2 MB/s
Did 59000 AES-128-CBC-SHA1 (128 bytes) open operations in 3046786us (19364.7 ops/sec): 2.5 MB/s
Did 50000 AES-128-CBC-SHA1 (240 bytes) open operations in 3043643us (16427.7 ops/sec): 3.9 MB/s

Penryn (Mac mini, mid 2010)
Before:
Did 152000 AES-128-CBC-SHA1 (16 bytes) open operations in 1004422us (151330.8 ops/sec): 2.4 MB/s
Did 143000 AES-128-CBC-SHA1 (32 bytes) open operations in 1000443us (142936.7 ops/sec): 4.6 MB/s
Did 136000 AES-128-CBC-SHA1 (48 bytes) open operations in 1006580us (135111.0 ops/sec): 6.5 MB/s
Did 146000 AES-128-CBC-SHA1 (96 bytes) open operations in 1005731us (145168.0 ops/sec): 13.9 MB/s
Did 138000 AES-128-CBC-SHA1 (112 bytes) open operations in 1003330us (137542.0 ops/sec): 15.4 MB/s
Did 133000 AES-128-CBC-SHA1 (128 bytes) open operations in 1005876us (132223.1 ops/sec): 16.9 MB/s
Did 117000 AES-128-CBC-SHA1 (240 bytes) open operations in 1004922us (116426.9 ops/sec): 27.9 MB/s
After:
Did 159000 AES-128-CBC-SHA1 (16 bytes) open operations in 1000505us (158919.7 ops/sec): 2.5 MB/s
Did 157000 AES-128-CBC-SHA1 (32 bytes) open operations in 1006091us (156049.5 ops/sec): 5.0 MB/s
Did 154000 AES-128-CBC-SHA1 (48 bytes) open operations in 1002720us (153582.3 ops/sec): 7.4 MB/s
Did 146000 AES-128-CBC-SHA1 (96 bytes) open operations in 1002567us (145626.2 ops/sec): 14.0 MB/s
Did 135000 AES-128-CBC-SHA1 (112 bytes) open operations in 1001212us (134836.6 ops/sec): 15.1 MB/s
Did 133000 AES-128-CBC-SHA1 (128 bytes) open operations in 1006441us (132148.8 ops/sec): 16.9 MB/s
Did 115000 AES-128-CBC-SHA1 (240 bytes) open operations in 1005246us (114399.9 ops/sec): 27.5 MB/s

Bug: 256
Change-Id: I864b4455ada0d4d245380fce6f869dabb0686354
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35167
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/aes/asm/bsaes-armv7.pl b/crypto/fipsmodule/aes/asm/bsaes-armv7.pl
index c6e0b17..d4db3b4 100644
--- a/crypto/fipsmodule/aes/asm/bsaes-armv7.pl
+++ b/crypto/fipsmodule/aes/asm/bsaes-armv7.pl
@@ -1113,26 +1113,12 @@
 my ($keysched)=("sp");
 
 $code.=<<___;
-@ TODO(davidben): This should be aes_nohw_cbc_encrypt, but that function does
-@ not exist. Rather than add it, patch this fallback out. See
-@ https://crbug.com/boringssl/256.
-.extern AES_cbc_encrypt
-.extern aes_nohw_decrypt
-
 .global	bsaes_cbc_encrypt
 .type	bsaes_cbc_encrypt,%function
 .align	5
 bsaes_cbc_encrypt:
-#ifndef	__KERNEL__
-	cmp	$len, #128
-#ifndef	__thumb__
-	blo	AES_cbc_encrypt
-#else
-	bhs	1f
-	b	AES_cbc_encrypt
-1:
-#endif
-#endif
+	@ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
+	@ short inputs. We patch this out, using bsaes for all input sizes.
 
 	@ it is up to the caller to make sure we are called with enc == 0
 
@@ -1230,10 +1216,7 @@
 	adds	$len, $len, #8
 	beq	.Lcbc_dec_done
 
-	vld1.8	{@XMM[0]}, [$inp]!		@ load input
-	cmp	$len, #2
-	blo	.Lcbc_dec_one
-	vld1.8	{@XMM[1]}, [$inp]!
+	@ Set up most parameters for the _bsaes_decrypt8 call.
 #ifndef	BSAES_ASM_EXTENDED_KEY
 	mov	r4, $keysched			@ pass the key
 #else
@@ -1241,6 +1224,11 @@
 #endif
 	mov	r5, $rounds
 	vstmia	$fp, {@XMM[15]}			@ put aside IV
+
+	vld1.8	{@XMM[0]}, [$inp]!		@ load input
+	cmp	$len, #2
+	blo	.Lcbc_dec_one
+	vld1.8	{@XMM[1]}, [$inp]!
 	beq	.Lcbc_dec_two
 	vld1.8	{@XMM[2]}, [$inp]!
 	cmp	$len, #4
@@ -1358,16 +1346,11 @@
 .align	4
 .Lcbc_dec_one:
 	sub	$inp, $inp, #0x10
-	mov	$rounds, $out			@ save original out pointer
-	mov	$out, $fp			@ use the iv scratch space as out buffer
-	mov	r2, $key
-	vmov	@XMM[4],@XMM[15]		@ just in case ensure that IV
-	vmov	@XMM[5],@XMM[0]			@ and input are preserved
-	bl	aes_nohw_decrypt
-	vld1.8	{@XMM[0]}, [$fp]		@ load result
-	veor	@XMM[0], @XMM[0], @XMM[4]	@ ^= IV
-	vmov	@XMM[15], @XMM[5]		@ @XMM[5] holds input
-	vst1.8	{@XMM[0]}, [$rounds]		@ write output
+	bl	_bsaes_decrypt8
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[15]}, [$inp]!		@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vst1.8	{@XMM[0]}, [$out]!		@ write output
 
 .Lcbc_dec_done:
 #ifndef	BSAES_ASM_EXTENDED_KEY
diff --git a/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl b/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl
index 899490f..3bb2819 100644
--- a/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl
@@ -811,8 +811,6 @@
 $code.=<<___;
 .text
 
-.extern	aes_nohw_decrypt
-
 .type	_bsaes_encrypt8,\@abi-omnipotent
 .align	64
 _bsaes_encrypt8:
@@ -1608,22 +1606,14 @@
 ___
 }
 $code.=<<___;
-.extern	aes_nohw_cbc_encrypt
 .globl	bsaes_cbc_encrypt
 .type	bsaes_cbc_encrypt,\@abi-omnipotent
 .align	16
 bsaes_cbc_encrypt:
 .cfi_startproc
-___
-$code.=<<___ if ($win64);
-	mov	48(%rsp),$arg6		# pull direction flag
-___
-$code.=<<___;
-	cmp	\$0,$arg6
-	jne	aes_nohw_cbc_encrypt
-	cmp	\$128,$arg3
-	jb	aes_nohw_cbc_encrypt
-
+	# In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
+	# short inputs or if enc is one. We patch this out, using bsaes for all
+	# input sizes. The caller is required to ensure enc is zero.
 	mov	%rsp, %rax
 .Lcbc_dec_prologue:
 	push	%rbp
@@ -1682,6 +1672,8 @@
 
 	movdqu	(%rbx), @XMM[15]	# load IV
 	sub	\$8,$len
+	jc	.Lcbc_dec_loop_done
+
 .Lcbc_dec_loop:
 	movdqu	0x00($inp), @XMM[0]	# load input
 	movdqu	0x10($inp), @XMM[1]
@@ -1726,6 +1718,7 @@
 	sub	\$8,$len
 	jnc	.Lcbc_dec_loop
 
+.Lcbc_dec_loop_done:
 	add	\$8,$len
 	jz	.Lcbc_dec_done
 
@@ -1858,13 +1851,12 @@
 	jmp	.Lcbc_dec_done
 .align	16
 .Lcbc_dec_one:
-	lea	($inp), $arg1
-	lea	0x20(%rbp), $arg2	# buffer output
-	lea	($key), $arg3
-	call	aes_nohw_decrypt		# doesn't touch %xmm
-	pxor	0x20(%rbp), @XMM[15]	# ^= IV
-	movdqu	@XMM[15], ($out)	# write output
-	movdqa	@XMM[0], @XMM[15]	# IV
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[15]	# IV
+	movdqu	@XMM[0], 0x00($out)	# write output
+	jmp	.Lcbc_dec_done
 
 .Lcbc_dec_done:
 	movdqu	@XMM[15], (%rbx)	# return IV
diff --git a/crypto/fipsmodule/aes/internal.h b/crypto/fipsmodule/aes/internal.h
index a05abcb..63070bc 100644
--- a/crypto/fipsmodule/aes/internal.h
+++ b/crypto/fipsmodule/aes/internal.h
@@ -133,7 +133,7 @@
 
 #if defined(BSAES)
 // On platforms where BSAES gets defined (just above), then these functions are
-// provided by asm.
+// provided by asm. Note |bsaes_cbc_encrypt| requires |enc| to be zero.
 void bsaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
                        const AES_KEY *key, uint8_t ivec[16], int enc);
 void bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,