Revert "Import chacha-x86.pl fix."
This reverts commit 762e1d039c1d85e4651700eed82801878a9a86bc. We no longer need
to support overlapping buffers where out < in, so it is better to keep the
assembly aligned with upstream.
Change-Id: I345bf822953bd0e1e79ad5ab4d337dcb22e7676b
Reviewed-on: https://boringssl-review.googlesource.com/8232
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/chacha/asm/chacha-x86.pl b/crypto/chacha/asm/chacha-x86.pl
index e576029..edce43d 100755
--- a/crypto/chacha/asm/chacha-x86.pl
+++ b/crypto/chacha/asm/chacha-x86.pl
@@ -19,13 +19,13 @@
# P4 18.6/+84%
# Core2 9.56/+89% 4.83
# Westmere 9.50/+45% 3.35
-# Sandy Bridge 10.7/+47% 3.24
-# Haswell 8.22/+50% 2.89
-# Silvermont 17.8/+36% 8.53
+# Sandy Bridge 10.5/+47% 3.20
+# Haswell 8.15/+50% 2.83
+# Silvermont 17.4/+36% 8.35
# Sledgehammer 10.2/+54%
-# Bulldozer 13.5/+50% 4.39(*)
+# Bulldozer 13.4/+50% 4.38(*)
#
-# (*) Bulldozer actually executes 4xXOP code path that delivers 3.50;
+# (*) Bulldozer actually executes 4xXOP code path that delivers 3.55;
#
# Modified from upstream OpenSSL to remove the XOP code.
@@ -224,20 +224,18 @@
&xor ($a, &DWP(4*0,$b)); # xor with input
&xor ($b_,&DWP(4*4,$b));
- &mov (&DWP(4*0,"esp"),$a); # off-load for later write
+ &mov (&DWP(4*0,"esp"),$a);
&mov ($a,&wparam(0)); # load output pointer
&xor ($c, &DWP(4*8,$b));
&xor ($c_,&DWP(4*9,$b));
&xor ($d, &DWP(4*12,$b));
&xor ($d_,&DWP(4*14,$b));
- &mov (&DWP(4*4,"esp"),$b_);
- &mov ($b_,&DWP(4*0,"esp"));
- &mov (&DWP(4*8,"esp"),$c);
- &mov (&DWP(4*9,"esp"),$c_);
- &mov (&DWP(4*12,"esp"),$d);
- &mov (&DWP(4*14,"esp"),$d_);
+ &mov (&DWP(4*4,$a),$b_); # write output
+ &mov (&DWP(4*8,$a),$c);
+ &mov (&DWP(4*9,$a),$c_);
+ &mov (&DWP(4*12,$a),$d);
+ &mov (&DWP(4*14,$a),$d_);
- &mov (&DWP(4*0,$a),$b_); # write output in order
&mov ($b_,&DWP(4*1,"esp"));
&mov ($c, &DWP(4*2,"esp"));
&mov ($c_,&DWP(4*3,"esp"));
@@ -254,45 +252,35 @@
&xor ($d, &DWP(4*5,$b));
&xor ($d_,&DWP(4*6,$b));
&mov (&DWP(4*1,$a),$b_);
- &mov ($b_,&DWP(4*4,"esp"));
&mov (&DWP(4*2,$a),$c);
&mov (&DWP(4*3,$a),$c_);
- &mov (&DWP(4*4,$a),$b_);
&mov (&DWP(4*5,$a),$d);
&mov (&DWP(4*6,$a),$d_);
- &mov ($c,&DWP(4*7,"esp"));
- &mov ($d,&DWP(4*8,"esp"));
- &mov ($d_,&DWP(4*9,"esp"));
- &add ($c,&DWP(64+4*7,"esp"));
- &mov ($b_, &DWP(4*10,"esp"));
- &xor ($c,&DWP(4*7,$b));
+ &mov ($b_,&DWP(4*7,"esp"));
+ &mov ($c, &DWP(4*10,"esp"));
&mov ($c_,&DWP(4*11,"esp"));
- &mov (&DWP(4*7,$a),$c);
- &mov (&DWP(4*8,$a),$d);
- &mov (&DWP(4*9,$a),$d_);
-
- &add ($b_, &DWP(64+4*10,"esp"));
- &add ($c_,&DWP(64+4*11,"esp"));
- &xor ($b_, &DWP(4*10,$b));
- &xor ($c_,&DWP(4*11,$b));
- &mov (&DWP(4*10,$a),$b_);
- &mov (&DWP(4*11,$a),$c_);
-
- &mov ($c,&DWP(4*12,"esp"));
- &mov ($c_,&DWP(4*14,"esp"));
&mov ($d, &DWP(4*13,"esp"));
&mov ($d_,&DWP(4*15,"esp"));
+ &add ($b_,&DWP(64+4*7,"esp"));
+ &add ($c, &DWP(64+4*10,"esp"));
+ &add ($c_,&DWP(64+4*11,"esp"));
&add ($d, &DWP(64+4*13,"esp"));
&add ($d_,&DWP(64+4*15,"esp"));
+ &xor ($b_,&DWP(4*7,$b));
+ &xor ($c, &DWP(4*10,$b));
+ &xor ($c_,&DWP(4*11,$b));
&xor ($d, &DWP(4*13,$b));
&xor ($d_,&DWP(4*15,$b));
&lea ($b,&DWP(4*16,$b));
- &mov (&DWP(4*12,$a),$c);
+ &mov (&DWP(4*7,$a),$b_);
+ &mov ($b_,&DWP(4*0,"esp"));
+ &mov (&DWP(4*10,$a),$c);
&mov ($c,&wparam(2)); # len
+ &mov (&DWP(4*11,$a),$c_);
&mov (&DWP(4*13,$a),$d);
- &mov (&DWP(4*14,$a),$c_);
&mov (&DWP(4*15,$a),$d_);
+ &mov (&DWP(4*0,$a),$b_);
&lea ($a,&DWP(4*16,$a));
&sub ($c,64);
&jnz (&label("outer_loop"));
@@ -567,12 +555,12 @@
my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
- for($i=0;$i<256;$i+=64) {
- #&movdqa ($xa0,&QWP($i+16*0-128,"ebx")); # it's there
- &movdqa ($xa1,&QWP($i+16*1-128,"ebx"));
- &movdqa ($xa2,&QWP($i+16*2-128,"ebx"));
- &movdqa ($xa3,&QWP($i+16*3-128,"ebx"));
+ #&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there
+ &movdqa ($xa1,&QWP(16*1-128,"ebx"));
+ &movdqa ($xa2,&QWP(16*2-128,"ebx"));
+ &movdqa ($xa3,&QWP(16*3-128,"ebx"));
+ for($i=0;$i<256;$i+=64) {
&paddd ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material
&paddd ($xa1,&QWP($i+16*1-128,"ebp"));
&paddd ($xa2,&QWP($i+16*2-128,"ebp"));
@@ -593,29 +581,25 @@
#($xa2,$xt2)=($xt2,$xa2);
- &movdqa (&QWP($i+16*0-128,"ebx"),$xa0);
+ &movdqu ($xt0,&QWP(64*0-128,$inp)); # load input
+ &movdqu ($xt1,&QWP(64*1-128,$inp));
+ &movdqu ($xa2,&QWP(64*2-128,$inp));
+ &movdqu ($xt3,&QWP(64*3-128,$inp));
+ &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
+ &pxor ($xt0,$xa0);
&movdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192);
- &movdqa (&QWP($i+16*1-128,"ebx"),$xa1);
- &movdqa (&QWP($i+16*2-128,"ebx"),$xt2);
- &movdqa (&QWP($i+16*3-128,"ebx"),$xa3);
+ &pxor ($xt1,$xa1);
+ &movdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192);
+ &pxor ($xt2,$xa2);
+ &movdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192);
+ &pxor ($xt3,$xa3);
+ &movdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192);
+ &movdqu (&QWP(64*0-128,$out),$xt0); # store output
+ &movdqu (&QWP(64*1-128,$out),$xt1);
+ &movdqu (&QWP(64*2-128,$out),$xt2);
+ &movdqu (&QWP(64*3-128,$out),$xt3);
+ &lea ($out,&QWP($i<192?16:(64*4-16*3),$out));
}
- for($i=0;$i<256;$i+=64) {
- my $j = 16*($i/64);
- &movdqu ($xa0,&QWP($i+16*0-128,$inp)); # load input
- &movdqu ($xa1,&QWP($i+16*1-128,$inp));
- &movdqu ($xa2,&QWP($i+16*2-128,$inp));
- &movdqu ($xa3,&QWP($i+16*3-128,$inp));
- &pxor ($xa0,&QWP($j+64*0-128,"ebx"));
- &pxor ($xa1,&QWP($j+64*1-128,"ebx"));
- &pxor ($xa2,&QWP($j+64*2-128,"ebx"));
- &pxor ($xa3,&QWP($j+64*3-128,"ebx"));
- &movdqu (&QWP($i+16*0-128,$out),$xa0); # write output
- &movdqu (&QWP($i+16*1-128,$out),$xa1);
- &movdqu (&QWP($i+16*2-128,$out),$xa2);
- &movdqu (&QWP($i+16*3-128,$out),$xa3);
- }
- &lea ($inp,&DWP(256,$inp));
- &lea ($out,&DWP(256,$out));
&sub ($len,64*4);
&jnc (&label("outer_loop"));