Patch out the aes_nohw fallback in bsaes_ctr32_encrypt_blocks.

bsaes_ctr32_encrypt_blocks previously fell back to the table-based
aes_nohw_encrypt for inputs under 128 bytes. Instead, just run the usual
bsaes code, though it means we compute more blocks than needed.

This fixes some (but not all) the timing leaks and is needed for later
bsaes work.

Performance-wise, x86_64 actually sees a performance improvement for all but
tiny inputs. ARM does see a loss at small inputs however.

Cortex-A53 (Raspberry Pi 3 Model B+)
Before:
Did 299000 AES-128-GCM (16 bytes) seal operations in 1001123us (298664.6 ops/sec): 4.8 MB/s
Did 236000 AES-128-GCM (32 bytes) seal operations in 1001611us (235620.4 ops/sec): 7.5 MB/s
Did 167000 AES-128-GCM (64 bytes) seal operations in 1005706us (166052.5 ops/sec): 10.6 MB/s
Did 129000 AES-128-GCM (96 bytes) seal operations in 1006129us (128214.2 ops/sec): 12.3 MB/s
Did 116000 AES-128-GCM (112 bytes) seal operations in 1006302us (115273.5 ops/sec): 12.9 MB/s
Did 107000 AES-128-GCM (128 bytes) seal operations in 1000986us (106894.6 ops/sec): 13.7 MB/s
After:
Did 132000 AES-128-GCM (16 bytes) seal operations in 1005165us (131321.7 ops/sec): 2.1 MB/s
Did 128000 AES-128-GCM (32 bytes) seal operations in 1005966us (127240.9 ops/sec): 4.1 MB/s
Did 120000 AES-128-GCM (64 bytes) seal operations in 1003080us (119631.5 ops/sec): 7.7 MB/s
Did 113000 AES-128-GCM (96 bytes) seal operations in 1000557us (112937.1 ops/sec): 10.8 MB/s
Did 110000 AES-128-GCM (112 bytes) seal operations in 1000407us (109955.2 ops/sec): 12.3 MB/s
Did 108000 AES-128-GCM (128 bytes) seal operations in 1008830us (107054.7 ops/sec): 13.7 MB/s
(Inputs 128 bytes and up are unaffected by this CL.)

Nexus 7
Before:
Did 544000 AES-128-GCM (16 bytes) seal operations in 1001282us (543303.5 ops/sec): 8.7 MB/s
Did 475750 AES-128-GCM (32 bytes) seal operations in 1000244us (475633.9 ops/sec): 15.2 MB/s
Did 370500 AES-128-GCM (64 bytes) seal operations in 1000519us (370307.8 ops/sec): 23.7 MB/s
Did 300750 AES-128-GCM (96 bytes) seal operations in 1000122us (300713.3 ops/sec): 28.9 MB/s
Did 275750 AES-128-GCM (112 bytes) seal operations in 1000702us (275556.6 ops/sec): 30.9 MB/s
Did 251000 AES-128-GCM (128 bytes) seal operations in 1000214us (250946.3 ops/sec): 32.1 MB/s
After:
Did 296000 AES-128-GCM (16 bytes) seal operations in 1001129us (295666.2 ops/sec): 4.7 MB/s
Did 288750 AES-128-GCM (32 bytes) seal operations in 1000488us (288609.2 ops/sec): 9.2 MB/s
Did 267250 AES-128-GCM (64 bytes) seal operations in 1000641us (267078.8 ops/sec): 17.1 MB/s
Did 253250 AES-128-GCM (96 bytes) seal operations in 1000915us (253018.5 ops/sec): 24.3 MB/s
Did 248000 AES-128-GCM (112 bytes) seal operations in 1000091us (247977.4 ops/sec): 27.8 MB/s
Did 249000 AES-128-GCM (128 bytes) seal operations in 1000794us (248802.5 ops/sec): 31.8 MB/s

Penryn (Mac mini, mid 2010)
Before:
Did 1331000 AES-128-GCM (16 bytes) seal operations in 1000263us (1330650.0 ops/sec): 21.3 MB/s
Did 991000 AES-128-GCM (32 bytes) seal operations in 1000274us (990728.5 ops/sec): 31.7 MB/s
Did 780000 AES-128-GCM (48 bytes) seal operations in 1000278us (779783.2 ops/sec): 37.4 MB/s
Did 483000 AES-128-GCM (96 bytes) seal operations in 1000137us (482933.8 ops/sec): 46.4 MB/s
Did 428000 AES-128-GCM (112 bytes) seal operations in 1001132us (427516.1 ops/sec): 47.9 MB/s
Did 682000 AES-128-GCM (128 bytes) seal operations in 1000564us (681615.6 ops/sec): 87.2 MB/s
After:
Did 953000 AES-128-GCM (16 bytes) seal operations in 1000385us (952633.2 ops/sec): 15.2 MB/s
Did 903000 AES-128-GCM (32 bytes) seal operations in 1000998us (902099.7 ops/sec): 28.9 MB/s
Did 850000 AES-128-GCM (48 bytes) seal operations in 1000938us (849203.4 ops/sec): 40.8 MB/s
Did 736000 AES-128-GCM (96 bytes) seal operations in 1000886us (735348.5 ops/sec): 70.6 MB/s
Did 702000 AES-128-GCM (112 bytes) seal operations in 1000657us (701539.1 ops/sec): 78.6 MB/s
Did 676000 AES-128-GCM (128 bytes) seal operations in 1000405us (675726.3 ops/sec): 86.5 MB/s

Bug: 256
Change-Id: I9403da607dd1feaff7b3c9b76fe78b66018fb753
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35166
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/aes/asm/bsaes-armv7.pl b/crypto/fipsmodule/aes/asm/bsaes-armv7.pl
index 11607d1..c6e0b17 100644
--- a/crypto/fipsmodule/aes/asm/bsaes-armv7.pl
+++ b/crypto/fipsmodule/aes/asm/bsaes-armv7.pl
@@ -1393,14 +1393,12 @@
 my $keysched = "sp";
 
 $code.=<<___;
-.extern	aes_nohw_encrypt
 .global	bsaes_ctr32_encrypt_blocks
 .type	bsaes_ctr32_encrypt_blocks,%function
 .align	5
 bsaes_ctr32_encrypt_blocks:
-	cmp	$len, #8			@ use plain AES for
-	blo	.Lctr_enc_short			@ small sizes
-
+	@ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
+	@ out to retain a constant-time implementation.
 	mov	ip, sp
 	stmdb	sp!, {r4-r10, lr}
 	VFP_ABI_PUSH
@@ -1576,50 +1574,8 @@
 	VFP_ABI_POP
 	ldmia	sp!, {r4-r10, pc}	@ return
 
-.align	4
-.Lctr_enc_short:
-	ldr	ip, [sp]		@ ctr pointer is passed on stack
-	stmdb	sp!, {r4-r8, lr}
-
-	mov	r4, $inp		@ copy arguments
-	mov	r5, $out
-	mov	r6, $len
-	mov	r7, $key
-	ldr	r8, [ip, #12]		@ load counter LSW
-	vld1.8	{@XMM[1]}, [ip]		@ load whole counter value
-#ifdef __ARMEL__
-	rev	r8, r8
-#endif
-	sub	sp, sp, #0x10
-	vst1.8	{@XMM[1]}, [sp]		@ copy counter value
-	sub	sp, sp, #0x10
-
-.Lctr_enc_short_loop:
-	add	r0, sp, #0x10		@ input counter value
-	mov	r1, sp			@ output on the stack
-	mov	r2, r7			@ key
-
-	bl	aes_nohw_encrypt
-
-	vld1.8	{@XMM[0]}, [r4]!	@ load input
-	vld1.8	{@XMM[1]}, [sp]		@ load encrypted counter
-	add	r8, r8, #1
-#ifdef __ARMEL__
-	rev	r0, r8
-	str	r0, [sp, #0x1c]		@ next counter value
-#else
-	str	r8, [sp, #0x1c]		@ next counter value
-#endif
-	veor	@XMM[0],@XMM[0],@XMM[1]
-	vst1.8	{@XMM[0]}, [r5]!	@ store output
-	subs	r6, r6, #1
-	bne	.Lctr_enc_short_loop
-
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-	vstmia		sp!, {q0-q1}
-
-	ldmia	sp!, {r4-r8, pc}
+	@ OpenSSL contains aes_nohw_* fallback code here. We patch this
+	@ out to retain a constant-time implementation.
 .size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
 ___
 }
diff --git a/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl b/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl
index 81331bf..899490f 100644
--- a/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl
@@ -811,7 +811,6 @@
 $code.=<<___;
 .text
 
-.extern	aes_nohw_encrypt
 .extern	aes_nohw_decrypt
 
 .type	_bsaes_encrypt8,\@abi-omnipotent
@@ -1968,8 +1967,8 @@
 	mov	$arg3, $len
 	mov	$arg4, $key
 	movdqa	%xmm0, 0x20(%rbp)	# copy counter
-	cmp	\$8, $arg3
-	jb	.Lctr_enc_short
+	# In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
+	# out to retain a constant-time implementation.
 
 	mov	%eax, %ebx		# rounds
 	shl	\$7, %rax		# 128 bytes per inner round key
@@ -2103,27 +2102,9 @@
 	movdqu	0x60($inp), @XMM[14]
 	pxor	@XMM[14], @XMM[2]
 	movdqu	@XMM[2], 0x60($out)
-	jmp	.Lctr_enc_done
 
-.align	16
-.Lctr_enc_short:
-	lea	0x20(%rbp), $arg1
-	lea	0x30(%rbp), $arg2
-	lea	($key), $arg3
-	call	aes_nohw_encrypt
-	movdqu	($inp), @XMM[1]
-	lea	16($inp), $inp
-	mov	0x2c(%rbp), %eax	# load 32-bit counter
-	bswap	%eax
-	pxor	0x30(%rbp), @XMM[1]
-	inc	%eax			# increment
-	movdqu	@XMM[1], ($out)
-	bswap	%eax
-	lea	16($out), $out
-	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
-	dec	$len
-	jnz	.Lctr_enc_short
-
+	# OpenSSL contains aes_nohw_* fallback code here. We patch this
+	# out to retain a constant-time implementation.
 .Lctr_enc_done:
 	lea	(%rsp), %rax
 	pxor	%xmm0, %xmm0