chacha: Move 32-bit Arm CPU dispatch from assembly to C

This also removes the assembly's len == 0 early-return path, to match
what was done for aarch64. (The C code ensures the function is never
called with an empty input.)

Bug: 673
Change-Id: I7e868a9eb0b022c22c3f4ba2c8782ae1464c5a52
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64967
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: Bob Beck <bbe@google.com>
diff --git a/crypto/chacha/asm/chacha-armv4.pl b/crypto/chacha/asm/chacha-armv4.pl
index 24fbb84..fd92fdb 100755
--- a/crypto/chacha/asm/chacha-armv4.pl
+++ b/crypto/chacha/asm/chacha-armv4.pl
@@ -196,39 +196,14 @@
 .long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
 .Lone:
 .long	1,0,0,0
-#if __ARM_MAX_ARCH__>=7
-.LOPENSSL_armcap:
-.word   OPENSSL_armcap_P-.Lsigma
-#else
-.word	-1
-#endif
 
-.globl	ChaCha20_ctr32
-.type	ChaCha20_ctr32,%function
+.globl	ChaCha20_ctr32_nohw
+.type	ChaCha20_ctr32_nohw,%function
 .align	5
-ChaCha20_ctr32:
-.LChaCha20_ctr32:
+ChaCha20_ctr32_nohw:
 	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
 	stmdb	sp!,{r0-r2,r4-r11,lr}
 	adr	r14,.Lsigma
-	cmp	r2,#0			@ len==0?
-#ifdef	__thumb2__
-	itt	eq
-#endif
-	addeq	sp,sp,#4*3
-	beq	.Lno_data
-#if __ARM_MAX_ARCH__>=7
-	cmp	r2,#192			@ test len
-	bls	.Lshort
-	ldr	r4,[r14,#32]
-	ldr	r4,[r14,r4]
-# ifdef	__APPLE__
-	ldr	r4,[r4]
-# endif
-	tst	r4,#ARMV7_NEON
-	bne	.LChaCha20_neon
-.Lshort:
-#endif
 	ldmia	r12,{r4-r7}		@ load counter and nonce
 	sub	sp,sp,#4*(16)		@ off-load area
 	stmdb	sp!,{r4-r7}		@ copy counter and nonce
@@ -621,9 +596,8 @@
 
 .Ldone:
 	add	sp,sp,#4*(32+3)
-.Lno_data:
 	ldmia	sp!,{r4-r11,pc}
-.size	ChaCha20_ctr32,.-ChaCha20_ctr32
+.size	ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
 ___
 
 {{{
@@ -665,12 +639,12 @@
 .arch	armv7-a
 .fpu	neon
 
-.type	ChaCha20_neon,%function
+.globl	ChaCha20_ctr32_neon
+.type	ChaCha20_ctr32_neon,%function
 .align	5
-ChaCha20_neon:
+ChaCha20_ctr32_neon:
 	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
 	stmdb		sp!,{r0-r2,r4-r11,lr}
-.LChaCha20_neon:
 	adr		r14,.Lsigma
 	vstmdb		sp!,{d8-d15}		@ ABI spec says so
 	stmdb		sp!,{r0-r3}
@@ -1145,8 +1119,7 @@
 	vldmia		sp,{d8-d15}
 	add		sp,sp,#4*(16+3)
 	ldmia		sp!,{r4-r11,pc}
-.size	ChaCha20_neon,.-ChaCha20_neon
-.comm	OPENSSL_armcap_P,4,4
+.size	ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon
 #endif
 ___
 }}}
diff --git a/crypto/chacha/internal.h b/crypto/chacha/internal.h
index a8ae3cb..d31a044 100644
--- a/crypto/chacha/internal.h
+++ b/crypto/chacha/internal.h
@@ -30,11 +30,12 @@
                       const uint8_t nonce[16]);
 
 #if !defined(OPENSSL_NO_ASM) && \
-    (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM))
+    (defined(OPENSSL_X86) || defined(OPENSSL_X86_64))
 
 #define CHACHA20_ASM
 
-#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
+#elif !defined(OPENSSL_NO_ASM) && \
+    (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
 
 #define CHACHA20_ASM_NOHW