Remove XOP code from sha512-x86_64.pl.
Other XOP code was removed already.
Change-Id: I0c457effebd22f89e722653b93905a0b2e3eb5c0
Reviewed-on: https://boringssl-review.googlesource.com/c/33424
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
index 9d53ec4..4927850 100755
--- a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -108,6 +108,8 @@
# part, body_00_15; reducing the amount of SIMD instructions
# below certain limit makes no difference/sense; to conserve
# space SHA256 XOP code path is therefore omitted;
+#
+# Modified from upstream OpenSSL to remove the XOP code.
$flavour = shift;
$output = shift;
@@ -275,9 +277,7 @@
test \$`1<<29`,%r11d # check for SHA
jnz _shaext_shortcut
___
-$code.=<<___ if ($avx && $SZ==8);
- test \$`1<<11`,%r10d # check for XOP
- jnz .Lxop_shortcut
+ # XOP codepath removed.
___
$code.=<<___ if ($avx>1);
and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
@@ -1127,399 +1127,6 @@
if ($avx) {{
######################################################################
-# XOP code path
-#
-if ($SZ==8) { # SHA512 only
-$code.=<<___;
-.type ${func}_xop,\@function,3
-.align 64
-${func}_xop:
-.cfi_startproc
-.Lxop_shortcut:
- mov %rsp,%rax # copy %rsp
-.cfi_def_cfa_register %rax
- push %rbx
-.cfi_push %rbx
- push %rbp
-.cfi_push %rbp
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
- shl \$4,%rdx # num*16
- sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
- lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
- and \$-64,%rsp # align stack frame
- mov $ctx,$_ctx # save ctx, 1st arg
- mov $inp,$_inp # save inp, 2nd arh
- mov %rdx,$_end # save end pointer, "3rd" arg
- mov %rax,$_rsp # save copy of %rsp
-.cfi_cfa_expression $_rsp,deref,+8
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,16*$SZ+32(%rsp)
- movaps %xmm7,16*$SZ+48(%rsp)
- movaps %xmm8,16*$SZ+64(%rsp)
- movaps %xmm9,16*$SZ+80(%rsp)
-___
-$code.=<<___ if ($win64 && $SZ>4);
- movaps %xmm10,16*$SZ+96(%rsp)
- movaps %xmm11,16*$SZ+112(%rsp)
-___
-$code.=<<___;
-.Lprologue_xop:
-
- vzeroupper
- mov $SZ*0($ctx),$A
- mov $SZ*1($ctx),$B
- mov $SZ*2($ctx),$C
- mov $SZ*3($ctx),$D
- mov $SZ*4($ctx),$E
- mov $SZ*5($ctx),$F
- mov $SZ*6($ctx),$G
- mov $SZ*7($ctx),$H
- jmp .Lloop_xop
-___
- if ($SZ==4) { # SHA256
- my @X = map("%xmm$_",(0..3));
- my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
-
-$code.=<<___;
-.align 16
-.Lloop_xop:
- vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
- vmovdqu 0x00($inp),@X[0]
- vmovdqu 0x10($inp),@X[1]
- vmovdqu 0x20($inp),@X[2]
- vmovdqu 0x30($inp),@X[3]
- vpshufb $t3,@X[0],@X[0]
- lea $TABLE(%rip),$Tbl
- vpshufb $t3,@X[1],@X[1]
- vpshufb $t3,@X[2],@X[2]
- vpaddd 0x00($Tbl),@X[0],$t0
- vpshufb $t3,@X[3],@X[3]
- vpaddd 0x20($Tbl),@X[1],$t1
- vpaddd 0x40($Tbl),@X[2],$t2
- vpaddd 0x60($Tbl),@X[3],$t3
- vmovdqa $t0,0x00(%rsp)
- mov $A,$a1
- vmovdqa $t1,0x10(%rsp)
- mov $B,$a3
- vmovdqa $t2,0x20(%rsp)
- xor $C,$a3 # magic
- vmovdqa $t3,0x30(%rsp)
- mov $E,$a0
- jmp .Lxop_00_47
-
-.align 16
-.Lxop_00_47:
- sub \$`-16*2*$SZ`,$Tbl # size optimization
-___
-sub XOP_256_00_47 () {
-my $j = shift;
-my $body = shift;
-my @X = @_;
-my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
-
- &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
- eval(shift(@insns));
- eval(shift(@insns));
- &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrld ($t0,$t0,$sigma0[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t0,$t0,$t1);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrld ($t2,@X[3],$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t2);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrldq ($t3,$t3,8);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrld ($t2,@X[0],$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t2);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpslldq ($t3,$t3,8); # 22 instructions
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
- foreach (@insns) { eval; } # remaining instructions
- &vmovdqa (16*$j."(%rsp)",$t2);
-}
-
- for ($i=0,$j=0; $j<4; $j++) {
- &XOP_256_00_47($j,\&body_00_15,@X);
- push(@X,shift(@X)); # rotate(@X)
- }
- &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
- &jne (".Lxop_00_47");
-
- for ($i=0; $i<16; ) {
- foreach(body_00_15()) { eval; }
- }
-
- } else { # SHA512
- my @X = map("%xmm$_",(0..7));
- my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
-
-$code.=<<___;
-.align 16
-.Lloop_xop:
- vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
- vmovdqu 0x00($inp),@X[0]
- lea $TABLE+0x80(%rip),$Tbl # size optimization
- vmovdqu 0x10($inp),@X[1]
- vmovdqu 0x20($inp),@X[2]
- vpshufb $t3,@X[0],@X[0]
- vmovdqu 0x30($inp),@X[3]
- vpshufb $t3,@X[1],@X[1]
- vmovdqu 0x40($inp),@X[4]
- vpshufb $t3,@X[2],@X[2]
- vmovdqu 0x50($inp),@X[5]
- vpshufb $t3,@X[3],@X[3]
- vmovdqu 0x60($inp),@X[6]
- vpshufb $t3,@X[4],@X[4]
- vmovdqu 0x70($inp),@X[7]
- vpshufb $t3,@X[5],@X[5]
- vpaddq -0x80($Tbl),@X[0],$t0
- vpshufb $t3,@X[6],@X[6]
- vpaddq -0x60($Tbl),@X[1],$t1
- vpshufb $t3,@X[7],@X[7]
- vpaddq -0x40($Tbl),@X[2],$t2
- vpaddq -0x20($Tbl),@X[3],$t3
- vmovdqa $t0,0x00(%rsp)
- vpaddq 0x00($Tbl),@X[4],$t0
- vmovdqa $t1,0x10(%rsp)
- vpaddq 0x20($Tbl),@X[5],$t1
- vmovdqa $t2,0x20(%rsp)
- vpaddq 0x40($Tbl),@X[6],$t2
- vmovdqa $t3,0x30(%rsp)
- vpaddq 0x60($Tbl),@X[7],$t3
- vmovdqa $t0,0x40(%rsp)
- mov $A,$a1
- vmovdqa $t1,0x50(%rsp)
- mov $B,$a3
- vmovdqa $t2,0x60(%rsp)
- xor $C,$a3 # magic
- vmovdqa $t3,0x70(%rsp)
- mov $E,$a0
- jmp .Lxop_00_47
-
-.align 16
-.Lxop_00_47:
- add \$`16*2*$SZ`,$Tbl
-___
-sub XOP_512_00_47 () {
-my $j = shift;
-my $body = shift;
-my @X = @_;
-my @insns = (&$body,&$body); # 52 instructions
-
- &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
- eval(shift(@insns));
- eval(shift(@insns));
- &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrlq ($t0,$t0,$sigma0[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t0,$t0,$t1);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrlq ($t2,@X[7],$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t2);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
- foreach (@insns) { eval; } # remaining instructions
- &vmovdqa (16*$j."(%rsp)",$t2);
-}
-
- for ($i=0,$j=0; $j<8; $j++) {
- &XOP_512_00_47($j,\&body_00_15,@X);
- push(@X,shift(@X)); # rotate(@X)
- }
- &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
- &jne (".Lxop_00_47");
-
- for ($i=0; $i<16; ) {
- foreach(body_00_15()) { eval; }
- }
-}
-$code.=<<___;
- mov $_ctx,$ctx
- mov $a1,$A
-
- add $SZ*0($ctx),$A
- lea 16*$SZ($inp),$inp
- add $SZ*1($ctx),$B
- add $SZ*2($ctx),$C
- add $SZ*3($ctx),$D
- add $SZ*4($ctx),$E
- add $SZ*5($ctx),$F
- add $SZ*6($ctx),$G
- add $SZ*7($ctx),$H
-
- cmp $_end,$inp
-
- mov $A,$SZ*0($ctx)
- mov $B,$SZ*1($ctx)
- mov $C,$SZ*2($ctx)
- mov $D,$SZ*3($ctx)
- mov $E,$SZ*4($ctx)
- mov $F,$SZ*5($ctx)
- mov $G,$SZ*6($ctx)
- mov $H,$SZ*7($ctx)
- jb .Lloop_xop
-
- mov $_rsp,%rsi
-.cfi_def_cfa %rsi,8
- vzeroupper
-___
-$code.=<<___ if ($win64);
- movaps 16*$SZ+32(%rsp),%xmm6
- movaps 16*$SZ+48(%rsp),%xmm7
- movaps 16*$SZ+64(%rsp),%xmm8
- movaps 16*$SZ+80(%rsp),%xmm9
-___
-$code.=<<___ if ($win64 && $SZ>4);
- movaps 16*$SZ+96(%rsp),%xmm10
- movaps 16*$SZ+112(%rsp),%xmm11
-___
-$code.=<<___;
- mov -48(%rsi),%r15
-.cfi_restore %r15
- mov -40(%rsi),%r14
-.cfi_restore %r14
- mov -32(%rsi),%r13
-.cfi_restore %r13
- mov -24(%rsi),%r12
-.cfi_restore %r12
- mov -16(%rsi),%rbp
-.cfi_restore %rbp
- mov -8(%rsi),%rbx
-.cfi_restore %rbx
- lea (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_xop:
- ret
-.cfi_endproc
-.size ${func}_xop,.-${func}_xop
-___
-}
-######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };
@@ -2409,11 +2016,6 @@
.rva .LSEH_end_${func}_ssse3
.rva .LSEH_info_${func}_ssse3
___
-$code.=<<___ if ($avx && $SZ==8);
- .rva .LSEH_begin_${func}_xop
- .rva .LSEH_end_${func}_xop
- .rva .LSEH_info_${func}_xop
-___
$code.=<<___ if ($avx);
.rva .LSEH_begin_${func}_avx
.rva .LSEH_end_${func}_avx
@@ -2443,12 +2045,6 @@
.rva se_handler
.rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
___
-$code.=<<___ if ($avx && $SZ==8);
-.LSEH_info_${func}_xop:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
-___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
.byte 9,0,0,0