ghash-x86[_64].pl: ~15% performance improvement on Atom Silvermont

(other processors unaffected).

(Imported from upstream's 7078d93307d795cec577ec4a792b72fffed551ab)
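
The functional change in ghash-x86_64.pl is the new CPU gate: the
4x-aggregated loop is skipped on Atom Silvermont, detected by reading
CPUID.1:ECX out of OPENSSL_ia32cap_P+4 and testing for MOVBE (bit 22)
set while XSAVE (bit 26) is clear. A minimal standalone Perl sketch of
that predicate, assuming the same bit layout the patch uses; the sub
name is hypothetical:

    use strict;
    use warnings;

    my $MOVBE = 1 << 22;    # CPUID.1:ECX bit 22
    my $XSAVE = 1 << 26;    # CPUID.1:ECX bit 26

    # Hypothetical helper, for illustration only: may the 4x-aggregated
    # loop be used, given the dword read from OPENSSL_ia32cap_P+4?
    sub use_4x_aggregation {
        my ($ecx) = @_;
        # MOVBE without XSAVE identifies Atom Silvermont, where the
        # 4x path is a loss; the code then falls back to the 2x loop.
        return ($ecx & ($MOVBE | $XSAVE)) != $MOVBE;
    }

    printf "silvermont-like: %d\n", use_4x_aggregation($MOVBE);          # 0
    printf "typical core:    %d\n", use_4x_aggregation($MOVBE | $XSAVE); # 1

The remaining hunks (and all of the ghash-x86.pl changes) are pure
scheduling and register renaming: loads and XORs are moved across the
pclmulqdq instructions and NOP padding is added (.byte 0x66,0x90 is a
two-byte NOP), so the computed GHASH values are unchanged.
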
diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl
index d47e325..eb6d55e 100644
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@@ -1021,13 +1021,14 @@
 	&pshufd		($T1,$Xn,0b01001110);	# H*Ii+1
 	&movdqa		($Xhn,$Xn);
 	&pxor		($T1,$Xn);		#
+	&lea		($inp,&DWP(32,$inp));	# i+=2
 
 	&pclmulqdq	($Xn,$Hkey,0x00);	#######
 	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
-	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
 	&pclmulqdq	($T1,$T3,0x00);		#######
+	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+	&nop		();
 
-	&lea		($inp,&DWP(32,$inp));	# i+=2
 	&sub		($len,0x20);
 	&jbe		(&label("even_tail"));
 	&jmp		(&label("mod_loop"));
@@ -1036,22 +1037,23 @@
 	&pshufd		($T2,$Xi,0b01001110);	# H^2*(Ii+Xi)
 	&movdqa		($Xhi,$Xi);
 	&pxor		($T2,$Xi);		#
+	&nop		();
 
 	&pclmulqdq	($Xi,$Hkey,0x00);	#######
 	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
-	&movups		($Hkey,&QWP(0,$Htbl));	# load H
 	&pclmulqdq	($T2,$T3,0x10);		#######
-	&movdqa		($T3,&QWP(0,$const));
+	&movups		($Hkey,&QWP(0,$Htbl));	# load H
 
 	&xorps		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&movdqa		($T3,&QWP(0,$const));
 	&xorps		($Xhi,$Xhn);
 	 &movdqu	($Xhn,&QWP(0,$inp));	# Ii
 	&pxor		($T1,$Xi);		# aggregated Karatsuba post-processing
 	 &movdqu	($Xn,&QWP(16,$inp));	# Ii+1
 	&pxor		($T1,$Xhi);		#
 
-	&pxor		($T2,$T1);		#
 	 &pshufb	($Xhn,$T3);
+	&pxor		($T2,$T1);		#
 
 	&movdqa		($T1,$T2);		#
 	&psrldq		($T2,8);
@@ -1068,8 +1070,8 @@
 	  &pxor		($T1,$Xi);		#
 	  &psllq	($Xi,1);
 	  &pxor		($Xi,$T1);		#
-	&movups		($T3,&QWP(32,$Htbl));
 	&pclmulqdq	($Xn,$Hkey,0x00);	#######
+	&movups		($T3,&QWP(32,$Htbl));
 	  &psllq	($Xi,57);		#
 	  &movdqa	($T1,$Xi);		#
 	  &pslldq	($Xi,8);
@@ -1080,9 +1082,9 @@
 	  &movdqa	($T2,$Xi);		# 2nd phase
 	  &psrlq	($Xi,1);
 	&pxor		($T1,$Xhn);
+	  &pxor		($Xhi,$T2);		#
 	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
 	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
-	  &pxor		($Xhi,$T2);		#
 	  &pxor		($T2,$Xi);
 	  &psrlq	($Xi,5);
 	  &pxor		($Xi,$T2);		#
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index 7904248..04001e6 100644
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -214,6 +214,7 @@
 
 $code=<<___;
 .text
+.extern	OPENSSL_ia32cap_P
 
 .globl	gcm_gmult_4bit
 .type	gcm_gmult_4bit,\@function,2
@@ -597,7 +598,8 @@
 }
 
 { my ($Xip,$Htbl,$inp,$len)=@_4args;
-  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10));
+  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
 
 $code.=<<___;
 .globl	gcm_ghash_clmul
@@ -624,7 +626,6 @@
 ___
 $code.=<<___;
 	movdqa		.Lbswap_mask(%rip),$T3
-	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
 
 	movdqu		($Xip),$Xi
 	movdqu		($Htbl),$Hkey
@@ -640,10 +641,16 @@
 my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
 
 $code.=<<___;
+	mov		OPENSSL_ia32cap_P+4(%rip),%eax
 	cmp		\$0x30,$len
 	jb		.Lskip4x
 
+	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
+	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
+	je		.Lskip4x
+
 	sub		\$0x30,$len
+	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
 	movdqu		0x30($Htbl),$Hkey3
 	movdqu		0x40($Htbl),$Hkey4
 
@@ -819,51 +826,54 @@
 	pxor		$T1,$Xi			# Ii+Xi
 
 	movdqa		$Xln,$Xhn
-	pshufd		\$0b01001110,$Xln,$T1
-	pxor		$Xln,$T1
+	pshufd		\$0b01001110,$Xln,$Xmn
+	pxor		$Xln,$Xmn
 	pclmulqdq	\$0x00,$Hkey,$Xln
 	pclmulqdq	\$0x11,$Hkey,$Xhn
-	pclmulqdq	\$0x00,$HK,$T1
+	pclmulqdq	\$0x00,$HK,$Xmn
 
 	lea		32($inp),$inp		# i+=2
+	nop
 	sub		\$0x20,$len
 	jbe		.Leven_tail
+	nop
 	jmp		.Lmod_loop
 
 .align	32
 .Lmod_loop:
 	movdqa		$Xi,$Xhi
-	pshufd		\$0b01001110,$Xi,$T2	#
-	pxor		$Xi,$T2			#
+	movdqa		$Xmn,$T1
+	pshufd		\$0b01001110,$Xi,$Xmn	#
+	pxor		$Xi,$Xmn		#
 
 	pclmulqdq	\$0x00,$Hkey2,$Xi
 	pclmulqdq	\$0x11,$Hkey2,$Xhi
-	pclmulqdq	\$0x10,$HK,$T2
+	pclmulqdq	\$0x10,$HK,$Xmn
 
 	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
 	pxor		$Xhn,$Xhi
 	  movdqu	($inp),$Xhn		# Ii
+	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
 	  pshufb	$T3,$Xhn
 	  movdqu	16($inp),$Xln		# Ii+1
 
-	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
 	pxor		$Xhi,$T1
 	  pxor		$Xhn,$Xhi		# "Ii+Xi", consume early
-	pxor		$T1,$T2
+	pxor		$T1,$Xmn
 	 pshufb		$T3,$Xln
-	movdqa		$T2,$T1			#
+	movdqa		$Xmn,$T1		#
 	psrldq		\$8,$T1
-	pslldq		\$8,$T2			#
+	pslldq		\$8,$Xmn		#
 	pxor		$T1,$Xhi
-	pxor		$T2,$Xi			#
+	pxor		$Xmn,$Xi		#
 
 	movdqa		$Xln,$Xhn		#
 
 	  movdqa	$Xi,$T2			# 1st phase
 	  movdqa	$Xi,$T1
 	  psllq		\$5,$Xi
-	pclmulqdq	\$0x00,$Hkey,$Xln	#######
 	  pxor		$Xi,$T1			#
+	pclmulqdq	\$0x00,$Hkey,$Xln	#######
 	  psllq		\$1,$Xi
 	  pxor		$T1,$Xi			#
 	  psllq		\$57,$Xi		#
@@ -871,9 +881,9 @@
 	  pslldq	\$8,$Xi
 	  psrldq	\$8,$T1			#	
 	  pxor		$T2,$Xi
+	pshufd		\$0b01001110,$Xhn,$Xmn
 	  pxor		$T1,$Xhi		#
-	pshufd		\$0b01001110,$Xhn,$T1
-	pxor		$Xhn,$T1		#
+	pxor		$Xhn,$Xmn		#
 
 	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
 	  movdqa	$Xi,$T2			# 2nd phase
@@ -882,33 +892,35 @@
 	  pxor		$Xi,$T2
 	  psrlq		\$5,$Xi
 	  pxor		$T2,$Xi			#
-	  psrlq		\$1,$Xi			#
-	pclmulqdq	\$0x00,$HK,$T1		#######
-	  pxor		$Xhi,$Xi		#
-
 	lea		32($inp),$inp
+	  psrlq		\$1,$Xi			#
+	pclmulqdq	\$0x00,$HK,$Xmn		#######
+	  pxor		$Xhi,$Xi		#
+	  .byte		0x66,0x90
+
 	sub		\$0x20,$len
 	ja		.Lmod_loop
 
 .Leven_tail:
 	 movdqa		$Xi,$Xhi
-	 pshufd		\$0b01001110,$Xi,$T2	#
-	 pxor		$Xi,$T2			#
+	 movdqa		$Xmn,$T1
+	 pshufd		\$0b01001110,$Xi,$Xmn	#
+	 pxor		$Xi,$Xmn		#
 
 	pclmulqdq	\$0x00,$Hkey2,$Xi
 	pclmulqdq	\$0x11,$Hkey2,$Xhi
-	pclmulqdq	\$0x10,$HK,$T2
+	pclmulqdq	\$0x10,$HK,$Xmn
 
 	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
 	pxor		$Xhn,$Xhi
 	pxor		$Xi,$T1
 	pxor		$Xhi,$T1
-	pxor		$T1,$T2
-	movdqa		$T2,$T1			#
+	pxor		$T1,$Xmn
+	movdqa		$Xmn,$T1		#
 	psrldq		\$8,$T1
-	pslldq		\$8,$T2			#
+	pslldq		\$8,$Xmn		#
 	pxor		$T1,$Xhi
-	pxor		$T2,$Xi			#
+	pxor		$Xmn,$Xi		#
 ___
 	&reduction_alg9	($Xhi,$Xi);
 $code.=<<___;