aes gcm: Remove Atom Silvermont optimizations.

Goldmont, the successor to Silvermont in the Atom line, added XSAVE, so
these "MOVBE without XSAVE" code paths are strictly for Silvermont. With
them removed, Silvermont falls through to the remaining generic code
paths, which still work there but with reduced performance.

Change-Id: I57f530f487e0f9b6b3f6aac912dbfaaa46628b9f
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64788
Commit-Queue: Bob Beck <bbe@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 414b76b..a8abb4d 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -1303,10 +1303,7 @@
 	lea	7($ctr),%r9
 	 mov	%r10d,0x60+12(%rsp)
 	bswap	%r9d
-	leaq	OPENSSL_ia32cap_P(%rip),%r10
-	 mov	4(%r10),%r10d
 	xor	$key0,%r9d
-	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
 	mov	%r9d,0x70+12(%rsp)
 
 	$movkey	0x10($key),$rndkey1
@@ -1317,104 +1314,10 @@
 	cmp	\$8,$len		# $len is in blocks
 	jb	.Lctr32_tail		# short input if ($len<8)
 
-	sub	\$6,$len		# $len is biased by -6
-	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
-	je	.Lctr32_6x		# [which denotes Atom Silvermont]
-
 	lea	0x80($key),$key		# size optimization
-	sub	\$2,$len		# $len is biased by -8
+	sub	\$8,$len		# $len is biased by -8
 	jmp	.Lctr32_loop8
 
-.align	16
-.Lctr32_6x:
-	shl	\$4,$rounds
-	mov	\$48,$rnds_
-	bswap	$key0
-	lea	32($key,$rounds),$key	# end of key schedule
-	sub	%rax,%r10		# twisted $rounds
-	jmp	.Lctr32_loop6
-
-.align	16
-.Lctr32_loop6:
-	 add	\$6,$ctr		# next counter value
-	$movkey	-48($key,$rnds_),$rndkey0
-	aesenc	$rndkey1,$inout0
-	 mov	$ctr,%eax
-	 xor	$key0,%eax
-	aesenc	$rndkey1,$inout1
-	 movbe	%eax,`0x00+12`(%rsp)	# store next counter value
-	 lea	1($ctr),%eax
-	aesenc	$rndkey1,$inout2
-	 xor	$key0,%eax
-	 movbe	%eax,`0x10+12`(%rsp)
-	aesenc	$rndkey1,$inout3
-	 lea	2($ctr),%eax
-	 xor	$key0,%eax
-	aesenc	$rndkey1,$inout4
-	 movbe	%eax,`0x20+12`(%rsp)
-	 lea	3($ctr),%eax
-	aesenc	$rndkey1,$inout5
-	$movkey	-32($key,$rnds_),$rndkey1
-	 xor	$key0,%eax
-
-	aesenc	$rndkey0,$inout0
-	 movbe	%eax,`0x30+12`(%rsp)
-	 lea	4($ctr),%eax
-	aesenc	$rndkey0,$inout1
-	 xor	$key0,%eax
-	 movbe	%eax,`0x40+12`(%rsp)
-	aesenc	$rndkey0,$inout2
-	 lea	5($ctr),%eax
-	 xor	$key0,%eax
-	aesenc	$rndkey0,$inout3
-	 movbe	%eax,`0x50+12`(%rsp)
-	 mov	%r10,%rax		# mov	$rnds_,$rounds
-	aesenc	$rndkey0,$inout4
-	aesenc	$rndkey0,$inout5
-	$movkey	-16($key,$rnds_),$rndkey0
-
-	call	.Lenc_loop6
-
-	movdqu	($inp),$inout6		# load 6 input blocks
-	movdqu	0x10($inp),$inout7
-	movdqu	0x20($inp),$in0
-	movdqu	0x30($inp),$in1
-	movdqu	0x40($inp),$in2
-	movdqu	0x50($inp),$in3
-	lea	0x60($inp),$inp		# $inp+=6*16
-	$movkey	-64($key,$rnds_),$rndkey1
-	pxor	$inout0,$inout6		# inp^=E(ctr)
-	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
-	pxor	$inout1,$inout7
-	movaps	0x10(%rsp),$inout1
-	pxor	$inout2,$in0
-	movaps	0x20(%rsp),$inout2
-	pxor	$inout3,$in1
-	movaps	0x30(%rsp),$inout3
-	pxor	$inout4,$in2
-	movaps	0x40(%rsp),$inout4
-	pxor	$inout5,$in3
-	movaps	0x50(%rsp),$inout5
-	movdqu	$inout6,($out)		# store 6 output blocks
-	movdqu	$inout7,0x10($out)
-	movdqu	$in0,0x20($out)
-	movdqu	$in1,0x30($out)
-	movdqu	$in2,0x40($out)
-	movdqu	$in3,0x50($out)
-	lea	0x60($out),$out		# $out+=6*16
-
-	sub	\$6,$len
-	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow
-
-	add	\$6,$len		# restore real remaining $len
-	jz	.Lctr32_done		# done if ($len==0)
-
-	lea	-48($rnds_),$rounds
-	lea	-80($key,$rnds_),$key	# restore $key
-	neg	$rounds
-	shr	\$4,$rounds		# restore $rounds
-	jmp	.Lctr32_tail
-
 .align	32
 .Lctr32_loop8:
 	 add		\$8,$ctr		# next counter value
@@ -2906,16 +2809,10 @@
 	movdqa	$inout3,$in3
 	movdqu	0x50($inp),$inout5
 	movdqa	$inout4,$in4
-	leaq	OPENSSL_ia32cap_P(%rip),%r9
-	mov	4(%r9),%r9d
 	cmp	\$0x70,$len
 	jbe	.Lcbc_dec_six_or_seven
 
-	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
-	sub	\$0x50,$len		# $len is biased by -5*16
-	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
-	je	.Lcbc_dec_loop6_enter	# [which denotes Atom Silvermont]
-	sub	\$0x20,$len		# $len is biased by -7*16
+	sub	\$0x70,$len		# $len is biased by -7*16
 	lea	0x70($key),$key		# size optimization
 	jmp	.Lcbc_dec_loop8_enter
 .align	16
@@ -3107,51 +3004,6 @@
 	 pxor	$inout7,$inout7
 	jmp	.Lcbc_dec_tail_collected
 
-.align	16
-.Lcbc_dec_loop6:
-	movups	$inout5,($out)
-	lea	0x10($out),$out
-	movdqu	0x00($inp),$inout0	# load input
-	movdqu	0x10($inp),$inout1
-	movdqa	$inout0,$in0
-	movdqu	0x20($inp),$inout2
-	movdqa	$inout1,$in1
-	movdqu	0x30($inp),$inout3
-	movdqa	$inout2,$in2
-	movdqu	0x40($inp),$inout4
-	movdqa	$inout3,$in3
-	movdqu	0x50($inp),$inout5
-	movdqa	$inout4,$in4
-.Lcbc_dec_loop6_enter:
-	lea	0x60($inp),$inp
-	movdqa	$inout5,$inout6
-
-	call	_aesni_decrypt6
-
-	pxor	$iv,$inout0		# ^= IV
-	movdqa	$inout6,$iv
-	pxor	$in0,$inout1
-	movdqu	$inout0,($out)
-	pxor	$in1,$inout2
-	movdqu	$inout1,0x10($out)
-	pxor	$in2,$inout3
-	movdqu	$inout2,0x20($out)
-	pxor	$in3,$inout4
-	mov	$key_,$key
-	movdqu	$inout3,0x30($out)
-	pxor	$in4,$inout5
-	mov	$rnds_,$rounds
-	movdqu	$inout4,0x40($out)
-	lea	0x50($out),$out
-	sub	\$0x60,$len
-	ja	.Lcbc_dec_loop6
-
-	movdqa	$inout5,$inout0
-	add	\$0x50,$len
-	jle	.Lcbc_dec_clear_tail_collected
-	movups	$inout5,($out)
-	lea	0x10($out),$out
-
 .Lcbc_dec_tail:
 	movups	($inp),$inout0
 	sub	\$0x10,$len
diff --git a/crypto/fipsmodule/modes/asm/ghash-x86_64.pl b/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
index 8ad1104..33ee1cb 100644
--- a/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
+++ b/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
@@ -120,7 +120,6 @@
 
 $code=<<___;
 .text
-.extern	OPENSSL_ia32cap_P
 ___
 
 
@@ -387,15 +386,9 @@
 my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
 
 $code.=<<___;
-	leaq		OPENSSL_ia32cap_P(%rip),%rax
-	mov		4(%rax),%eax
 	cmp		\$0x30,$len
 	jb		.Lskip4x
 
-	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
-	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
-	je		.Lskip4x
-
 	sub		\$0x30,$len
 	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
 	movdqu		0x30($Htbl),$Hkey3