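ghash-x86_64.pl: optimize for upcoming Atom.

Reschedule the SIMD instructions surrounding pclmulqdq (including the
code at .Lmod4_loop and .Ltail4x), load Ii into $T2 instead of $Xhn,
drop the `.byte 0x66,0x90` that preceded the closing sub/ja of
.Lmod_loop, and refresh the performance figures in the comments.

(Imported from upstream commit 66aeaec067680e4a4ab956dcf30a295aaba07e2c)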
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index 04001e6..f1aca3e 100644
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -59,11 +59,11 @@
 # longer. A CPU with higher pclmulqdq issue rate would also benefit
 # from higher aggregate factor...
 #
-# Westmere	1.76(+14%)
-# Sandy Bridge	1.79(+9%)
-# Ivy Bridge	1.79(+8%)
+# Westmere	1.78(+13%)
+# Sandy Bridge	1.80(+8%)
+# Ivy Bridge	1.80(+7%)
 # Haswell	0.55(+93%) (if system doesn't support AVX)
-# Bulldozer	1.52(+25%)
+# Bulldozer	1.49(+27%)
 
 # March 2013
 #
@@ -673,8 +673,8 @@
 	pxor		$Xl,$Xm
 	pclmulqdq	\$0x00,$Hkey2,$Xl
 	pclmulqdq	\$0x11,$Hkey2,$Xh
-	xorps		$Xl,$Xln
 	pclmulqdq	\$0x10,$HK,$Xm
+	xorps		$Xl,$Xln
 	xorps		$Xh,$Xhn
 	movups		0x50($Htbl),$HK
 	xorps		$Xm,$Xmn
@@ -692,8 +692,8 @@
 	 pshufd		\$0b01001110,$Xi,$T1
 	 pxor		$Xi,$T1
 	pclmulqdq	\$0x11,$Hkey3,$Xh
-	xorps		$Xl,$Xln
 	pclmulqdq	\$0x00,$HK,$Xm
+	xorps		$Xl,$Xln
 	xorps		$Xh,$Xhn
 
 	lea	0x40($inp),$inp
@@ -711,23 +711,23 @@
 	xorps		$Xln,$Xi
 	 movdqu		0x20($inp),$Xln
 	 movdqa		$Xl,$Xh
-	 pshufd		\$0b01001110,$Xl,$Xm
 	pclmulqdq	\$0x10,$HK,$T1
+	 pshufd		\$0b01001110,$Xl,$Xm
 	xorps		$Xhn,$Xhi
 	 pxor		$Xl,$Xm
 	 pshufb		$T3,$Xln
 	movups		0x20($Htbl),$HK
-	 pclmulqdq	\$0x00,$Hkey,$Xl
 	xorps		$Xmn,$T1
-	 movdqa		$Xln,$Xhn
+	 pclmulqdq	\$0x00,$Hkey,$Xl
 	 pshufd		\$0b01001110,$Xln,$Xmn
 
 	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
-	 pxor		$Xln,$Xmn
+	 movdqa		$Xln,$Xhn
 	pxor		$Xhi,$T1		#
+	 pxor		$Xln,$Xmn
 	movdqa		$T1,$T2			#
-	pslldq		\$8,$T1
 	 pclmulqdq	\$0x11,$Hkey,$Xh
+	pslldq		\$8,$T1
 	psrldq		\$8,$T2			#
 	pxor		$T1,$Xi
 	movdqa		.L7_mask(%rip),$T1
@@ -736,8 +736,8 @@
 
 	pand		$Xi,$T1			# 1st phase
 	pshufb		$T1,$T2			#
-	 pclmulqdq	\$0x00,$HK,$Xm
 	pxor		$Xi,$T2			#
+	 pclmulqdq	\$0x00,$HK,$Xm
 	psllq		\$57,$T2		#
 	movdqa		$T2,$T1			#
 	pslldq		\$8,$T2
@@ -764,32 +764,31 @@
 	 movdqa		$Xl,$Xh
 	 pxor		$Xm,$Xmn
 	 pshufd		\$0b01001110,$Xl,$Xm
-	 pxor		$Xl,$Xm
-	 pclmulqdq	\$0x00,$Hkey3,$Xl
 	pxor		$T2,$Xi			#
 	pxor		$T1,$Xhi
+	 pxor		$Xl,$Xm
+	 pclmulqdq	\$0x00,$Hkey3,$Xl
 	psrlq		\$1,$Xi			#
+	pxor		$Xhi,$Xi		#
+	movdqa		$Xi,$Xhi
 	 pclmulqdq	\$0x11,$Hkey3,$Xh
 	 xorps		$Xl,$Xln
-	pxor		$Xhi,$Xi		#
+	pshufd		\$0b01001110,$Xi,$T1
+	pxor		$Xi,$T1
 
 	 pclmulqdq	\$0x00,$HK,$Xm
 	 xorps		$Xh,$Xhn
 
-	movdqa		$Xi,$Xhi
-	pshufd		\$0b01001110,$Xi,$T1
-	pxor		$Xi,$T1
-
 	lea	0x40($inp),$inp
 	sub	\$0x40,$len
 	jnc	.Lmod4_loop
 
 .Ltail4x:
 	pclmulqdq	\$0x00,$Hkey4,$Xi
-	xorps		$Xm,$Xmn
 	pclmulqdq	\$0x11,$Hkey4,$Xhi
-	xorps		$Xln,$Xi
 	pclmulqdq	\$0x10,$HK,$T1
+	xorps		$Xm,$Xmn
+	xorps		$Xln,$Xi
 	xorps		$Xhn,$Xhi
 	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
 	pxor		$Xmn,$T1
@@ -852,13 +851,13 @@
 
 	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
 	pxor		$Xhn,$Xhi
-	  movdqu	($inp),$Xhn		# Ii
+	  movdqu	($inp),$T2		# Ii
 	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
-	  pshufb	$T3,$Xhn
+	  pshufb	$T3,$T2
 	  movdqu	16($inp),$Xln		# Ii+1
 
 	pxor		$Xhi,$T1
-	  pxor		$Xhn,$Xhi		# "Ii+Xi", consume early
+	  pxor		$T2,$Xhi		# "Ii+Xi", consume early
 	pxor		$T1,$Xmn
 	 pshufb		$T3,$Xln
 	movdqa		$Xmn,$T1		#
@@ -885,9 +884,9 @@
 	  pxor		$T1,$Xhi		#
 	pxor		$Xhn,$Xmn		#
 
-	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
 	  movdqa	$Xi,$T2			# 2nd phase
 	  psrlq		\$1,$Xi
+	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
 	  pxor		$T2,$Xhi		#
 	  pxor		$Xi,$T2
 	  psrlq		\$5,$Xi
@@ -896,7 +895,6 @@
 	  psrlq		\$1,$Xi			#
 	pclmulqdq	\$0x00,$HK,$Xmn		#######
 	  pxor		$Xhi,$Xi		#
-	  .byte		0x66,0x90
 
 	sub		\$0x20,$len
 	ja		.Lmod_loop