poly1305/asm/poly1305-*.pl: flip horizontal add and reduction.

Only the 32-bit AVX2 code path needs this, but upstream chose to harmonize all
vector code paths.
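
For reference, the flow this change settles on is: fold the per-lane
accumulators together (the "horizontal addition"), then run the lazy-reduction
carry pass on the folded value before it is stored back. Below is a minimal
scalar Perl sketch of that order, assuming the usual five-limb base-2^26
layout; the two-lane model, limb values, and helper names are illustrative
only and are not the perlasm touched by this patch.

    #!/usr/bin/env perl
    use strict;
    use warnings;

    my $MASK = (1 << 26) - 1;

    # "Horizontal addition": fold lane 1 into lane 0.
    sub horizontal_add {
        my ($h) = @_;
        $h->[0][$_] += $h->[1][$_] for 0 .. 4;
    }

    # One lazy-reduction pass: carry between 26-bit limbs, wrapping the top
    # carry back into limb 0 as carry*5 (2^130 == 5 mod 2^130 - 5). The
    # result stays only partially reduced, which is all the caller needs.
    sub lazy_reduce {
        my ($h) = @_;
        my $c = 0;
        for my $i (0 .. 4) {
            $h->[$i] += $c;
            $c = $h->[$i] >> 26;
            $h->[$i] &= $MASK;
        }
        $h->[0] += 5 * $c;
        $h->[1] += $h->[0] >> 26;
        $h->[0] &= $MASK;
    }

    # Arbitrary near-maximal limbs in two lanes, just to exercise carries.
    my @h = (
        [0x3ffffff, 0x3ffffff, 1, 2, 3],
        [0x3ffffff, 5, 6, 7, 0x3ffffff],
    );
    horizontal_add(\@h);   # first: fold the lanes
    lazy_reduce($h[0]);    # then: carry pass on the folded value
    printf "h[%d] = 0x%x\n", $_, $h[0][$_] for 0 .. 4;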

RT#4346

(Imported from 1ea8ae5090f557fea2e5b4d5758b10566825d74b.)

I tested the new code manually on arm and aarch64, both NEON and non-NEON.
Steven reports that all variants pass on x86 and x86-64 as well.

I've left the 32-bit x86 AVX2 code disabled: valgrind can't run AVX2 code on
32-bit, so we have no way to verify test coverage there. Importing the fix
anyway avoids a diff with upstream, and we can enable the code path if we ever
end up caring.

Change-Id: Id9becc2adfbe44b84764f8e9c1fb5e8349c4d5a8
Reviewed-on: https://boringssl-review.googlesource.com/7295
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl
index 7470c9e..9b765ce 100755
--- a/crypto/poly1305/asm/poly1305-armv4.pl
+++ b/crypto/poly1305/asm/poly1305-armv4.pl
@@ -1057,6 +1057,15 @@
 
 .Lshort_tail:
 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ horizontal addition
+
+	vadd.i64	$D3#lo,$D3#lo,$D3#hi
+	vadd.i64	$D0#lo,$D0#lo,$D0#hi
+	vadd.i64	$D4#lo,$D4#lo,$D4#hi
+	vadd.i64	$D1#lo,$D1#lo,$D1#hi
+	vadd.i64	$D2#lo,$D2#lo,$D2#hi
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 	@ lazy reduction, but without narrowing
 
 	vshr.u64	$T0,$D3,#26
@@ -1086,15 +1095,6 @@
 	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
 	 vadd.i64	$D4,$D4,$T1		@ h3 -> h4
 
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ horizontal addition
-
-	vadd.i64	$D2#lo,$D2#lo,$D2#hi
-	vadd.i64	$D0#lo,$D0#lo,$D0#hi
-	vadd.i64	$D3#lo,$D3#lo,$D3#hi
-	vadd.i64	$D1#lo,$D1#lo,$D1#hi
-	vadd.i64	$D4#lo,$D4#lo,$D4#hi
-
 	cmp		$len,#0
 	bne		.Leven
 
diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl
index 96be485..1d9a81b 100755
--- a/crypto/poly1305/asm/poly1305-armv8.pl
+++ b/crypto/poly1305/asm/poly1305-armv8.pl
@@ -791,6 +791,19 @@
 
 .Lshort_tail:
 	////////////////////////////////////////////////////////////////
+	// horizontal add
+
+	addp	$ACC3,$ACC3,$ACC3
+	 ldp	d8,d9,[sp,#16]		// meet ABI requirements
+	addp	$ACC0,$ACC0,$ACC0
+	 ldp	d10,d11,[sp,#32]
+	addp	$ACC4,$ACC4,$ACC4
+	 ldp	d12,d13,[sp,#48]
+	addp	$ACC1,$ACC1,$ACC1
+	 ldp	d14,d15,[sp,#64]
+	addp	$ACC2,$ACC2,$ACC2
+
+	////////////////////////////////////////////////////////////////
 	// lazy reduction, but without narrowing
 
 	ushr	$T0.2d,$ACC3,#26
@@ -822,19 +835,6 @@
 	 add	$ACC4,$ACC4,$T1.2d	// h3 -> h4
 
 	////////////////////////////////////////////////////////////////
-	// horizontal add
-
-	addp	$ACC2,$ACC2,$ACC2
-	 ldp	d8,d9,[sp,#16]		// meet ABI requirements
-	addp	$ACC0,$ACC0,$ACC0
-	 ldp	d10,d11,[sp,#32]
-	addp	$ACC1,$ACC1,$ACC1
-	 ldp	d12,d13,[sp,#48]
-	addp	$ACC3,$ACC3,$ACC3
-	 ldp	d14,d15,[sp,#64]
-	addp	$ACC4,$ACC4,$ACC4
-
-	////////////////////////////////////////////////////////////////
 	// write the result, can be partially reduced
 
 	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl
index 510358b..75f6646 100755
--- a/crypto/poly1305/asm/poly1305-x86.pl
+++ b/crypto/poly1305/asm/poly1305-x86.pl
@@ -49,8 +49,8 @@
 	&static_label("enter_emit");
 	&external_label("OPENSSL_ia32cap_P");
 
-	# This may be set to 2, but the AVX2 code doesn't work.
-	# https://rt.openssl.org/Ticket/Display.html?id=4346
+	# This may be set to 2, but valgrind can't do AVX2 on 32-bit. Without a
+	# way to verify test coverage, keep it disabled.
 	$avx = 0;
 }
 
@@ -526,6 +526,8 @@
 			     },"edx");
 
 sub lazy_reduction {
+my $extra = shift;
+
 	################################################################
 	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
 	# and P. Schwabe
@@ -533,6 +535,7 @@
 	 &movdqa	($T0,$D3);
 	 &pand		($D3,$MASK);
 	 &psrlq		($T0,26);
+	 &$extra	()				if (defined($extra));
 	 &paddq		($T0,$D4);			# h3 -> h4
 	&movdqa		($T1,$D0);
 	&pand		($D0,$MASK);
@@ -1081,21 +1084,21 @@
 
 &set_label("short_tail");
 
-	&lazy_reduction	();
-
 	################################################################
 	# horizontal addition
 
+	&pshufd		($T1,$D4,0b01001110);
+	&pshufd		($T0,$D3,0b01001110);
+	&paddq		($D4,$T1);
+	&paddq		($D3,$T0);
 	&pshufd		($T1,$D0,0b01001110);
 	&pshufd		($T0,$D1,0b01001110);
-	&paddd		($D0,$T1);
+	&paddq		($D0,$T1);
+	&paddq		($D1,$T0);
 	&pshufd		($T1,$D2,0b01001110);
-	&paddd		($D1,$T0);
-	&pshufd		($T0,$D3,0b01001110);
-	&paddd		($D2,$T1);
-	&pshufd		($T1,$D4,0b01001110);
-	&paddd		($D3,$T0);
-	&paddd		($D4,$T1);
+	#&paddq		($D2,$T1);
+
+	&lazy_reduction	(sub { &paddq ($D2,$T1) });
 
 &set_label("done");
 	&movd		(&DWP(-16*3+4*0,"edi"),$D0);	# store hash value
@@ -1103,8 +1106,8 @@
 	&movd		(&DWP(-16*3+4*2,"edi"),$D2);
 	&movd		(&DWP(-16*3+4*3,"edi"),$D3);
 	&movd		(&DWP(-16*3+4*4,"edi"),$D4);
-&set_label("nodata");
 	&mov	("esp","ebp");
+&set_label("nodata");
 &function_end("_poly1305_blocks_sse2");
 
 &align	(32);
@@ -1425,7 +1428,7 @@
 	&test	("eax","eax");				# is_base2_26?
 	&jz	(&label("enter_blocks"));
 
-&set_label("enter_avx2",16);
+&set_label("enter_avx2");
 	&vzeroupper	();
 
 	&call	(&label("pic_point"));
@@ -1721,31 +1724,31 @@
 
 	&vpmuladd	(sub {	my $i=shift; &QWP(4+32*$i-128,"edx");	});
 
-	&vlazy_reduction();
-
 	################################################################
 	# horizontal addition
 
+	&vpsrldq	($T0,$D4,8);
+	&vpsrldq	($T1,$D3,8);
+	&vpaddq		($D4,$D4,$T0);
 	&vpsrldq	($T0,$D0,8);
+	&vpaddq		($D3,$D3,$T1);
 	&vpsrldq	($T1,$D1,8);
 	&vpaddq		($D0,$D0,$T0);
 	&vpsrldq	($T0,$D2,8);
 	&vpaddq		($D1,$D1,$T1);
-	&vpsrldq	($T1,$D3,8);
+	&vpermq		($T1,$D4,2);			# keep folding
 	&vpaddq		($D2,$D2,$T0);
-	&vpsrldq	($T0,$D4,8);
-	&vpaddq		($D3,$D3,$T1);
-	&vpermq		($T1,$D0,2);			# keep folding
-	&vpaddq		($D4,$D4,$T0);
+	&vpermq		($T0,$D3,2);
+	&vpaddq		($D4,$D4,$T1);
+	&vpermq		($T1,$D0,2);
+	&vpaddq		($D3,$D3,$T0);
 	&vpermq		($T0,$D1,2);
 	&vpaddq		($D0,$D0,$T1);
 	&vpermq		($T1,$D2,2);
 	&vpaddq		($D1,$D1,$T0);
-	&vpermq		($T0,$D3,2);
 	&vpaddq		($D2,$D2,$T1);
-	&vpermq		($T1,$D4,2);
-	&vpaddq		($D3,$D3,$T0);
-	&vpaddq		($D4,$D4,$T1);
+
+	&vlazy_reduction();
 
 	&cmp		("ecx",0);
 	&je		(&label("done"));
@@ -1762,14 +1765,14 @@
 	&jmp		(&label("even"));
 
 &set_label("done",16);
-	&vmovd		(&DWP(-16*3+4*0,"edi"),"xmm0");	# store hash value
-	&vmovd		(&DWP(-16*3+4*1,"edi"),"xmm1");
-	&vmovd		(&DWP(-16*3+4*2,"edi"),"xmm2");
-	&vmovd		(&DWP(-16*3+4*3,"edi"),"xmm3");
-	&vmovd		(&DWP(-16*3+4*4,"edi"),"xmm4");
+	&vmovd		(&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
+	&vmovd		(&DWP(-16*3+4*1,"edi"),&X($D1));
+	&vmovd		(&DWP(-16*3+4*2,"edi"),&X($D2));
+	&vmovd		(&DWP(-16*3+4*3,"edi"),&X($D3));
+	&vmovd		(&DWP(-16*3+4*4,"edi"),&X($D4));
 	&vzeroupper	();
-&set_label("nodata");
 	&mov	("esp","ebp");
+&set_label("nodata");
 &function_end("_poly1305_blocks_avx2");
 }
 &set_label("const_sse2",64);
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
index 460659a..6c332e9 100755
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -1181,6 +1181,20 @@
 
 .Lshort_tail_avx:
 	################################################################
+	# horizontal addition
+
+	vpsrldq		\$8,$D4,$T4
+	vpsrldq		\$8,$D3,$T3
+	vpsrldq		\$8,$D1,$T1
+	vpsrldq		\$8,$D0,$T0
+	vpsrldq		\$8,$D2,$T2
+	vpaddq		$T3,$D3,$D3
+	vpaddq		$T4,$D4,$D4
+	vpaddq		$T0,$D0,$D0
+	vpaddq		$T1,$D1,$D1
+	vpaddq		$T2,$D2,$D2
+
+	################################################################
 	# lazy reduction
 
 	vpsrlq		\$26,$D3,$H3
@@ -1214,25 +1228,11 @@
 	vpand		$MASK,$D3,$D3
 	vpaddq		$H3,$D4,$D4		# h3 -> h4
 
-	################################################################
-	# horizontal addition
-
-	vpsrldq		\$8,$D2,$T2
-	vpsrldq		\$8,$D0,$T0
-	vpsrldq		\$8,$D1,$T1
-	vpsrldq		\$8,$D3,$T3
-	vpsrldq		\$8,$D4,$T4
-	vpaddq		$T2,$D2,$H2
-	vpaddq		$T0,$D0,$H0
-	vpaddq		$T1,$D1,$H1
-	vpaddq		$T3,$D3,$H3
-	vpaddq		$T4,$D4,$H4
-
-	vmovd		$H0,`4*0-48-64`($ctx)	# save partially reduced
-	vmovd		$H1,`4*1-48-64`($ctx)
-	vmovd		$H2,`4*2-48-64`($ctx)
-	vmovd		$H3,`4*3-48-64`($ctx)
-	vmovd		$H4,`4*4-48-64`($ctx)
+	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
+	vmovd		$D1,`4*1-48-64`($ctx)
+	vmovd		$D2,`4*2-48-64`($ctx)
+	vmovd		$D3,`4*3-48-64`($ctx)
+	vmovd		$D4,`4*4-48-64`($ctx)
 ___
 $code.=<<___	if ($win64);
 	vmovdqa		0x50(%r11),%xmm6
@@ -1871,6 +1871,31 @@
 	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
 
 	################################################################
+	# horizontal addition
+
+	vpsrldq		\$8,$D1,$T1
+	vpsrldq		\$8,$H2,$T2
+	vpsrldq		\$8,$H3,$T3
+	vpsrldq		\$8,$H4,$T4
+	vpsrldq		\$8,$H0,$T0
+	vpaddq		$T1,$D1,$D1
+	vpaddq		$T2,$H2,$H2
+	vpaddq		$T3,$H3,$H3
+	vpaddq		$T4,$H4,$H4
+	vpaddq		$T0,$H0,$H0
+
+	vpermq		\$0x2,$H3,$T3
+	vpermq		\$0x2,$H4,$T4
+	vpermq		\$0x2,$H0,$T0
+	vpermq		\$0x2,$D1,$T1
+	vpermq		\$0x2,$H2,$T2
+	vpaddq		$T3,$H3,$H3
+	vpaddq		$T4,$H4,$H4
+	vpaddq		$T0,$H0,$H0
+	vpaddq		$T1,$D1,$D1
+	vpaddq		$T2,$H2,$H2
+
+	################################################################
 	# lazy reduction
 
 	vpsrlq		\$26,$H3,$D3
@@ -1904,31 +1929,6 @@
 	vpand		$MASK,$H3,$H3
 	vpaddq		$D3,$H4,$H4		# h3 -> h4
 
-	################################################################
-	# horizontal addition
-
-	vpsrldq		\$8,$H2,$T2
-	vpsrldq		\$8,$H0,$T0
-	vpsrldq		\$8,$H1,$T1
-	vpsrldq		\$8,$H3,$T3
-	vpsrldq		\$8,$H4,$T4
-	vpaddq		$T2,$H2,$H2
-	vpaddq		$T0,$H0,$H0
-	vpaddq		$T1,$H1,$H1
-	vpaddq		$T3,$H3,$H3
-	vpaddq		$T4,$H4,$H4
-
-	vpermq		\$0x2,$H2,$T2
-	vpermq		\$0x2,$H0,$T0
-	vpermq		\$0x2,$H1,$T1
-	vpermq		\$0x2,$H3,$T3
-	vpermq		\$0x2,$H4,$T4
-	vpaddq		$T2,$H2,$H2
-	vpaddq		$T0,$H0,$H0
-	vpaddq		$T1,$H1,$H1
-	vpaddq		$T3,$H3,$H3
-	vpaddq		$T4,$H4,$H4
-
 	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
 	vmovd		%x#$H1,`4*1-48-64`($ctx)
 	vmovd		%x#$H2,`4*2-48-64`($ctx)