aes gcm: Remove Atom Silvermont optimizations.
Goldmont, the successor to Silvermont in the Atom line, added XSAVE, so
these "MOVBE without XSAVE" code paths are strictly for Silvermont. The
remaining code paths will still work on Silvermont, but with reduced
performance.
Change-Id: I57f530f487e0f9b6b3f6aac912dbfaaa46628b9f
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64788
Commit-Queue: Bob Beck <bbe@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 414b76b..a8abb4d 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -1303,10 +1303,7 @@
lea 7($ctr),%r9
mov %r10d,0x60+12(%rsp)
bswap %r9d
- leaq OPENSSL_ia32cap_P(%rip),%r10
- mov 4(%r10),%r10d
xor $key0,%r9d
- and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
mov %r9d,0x70+12(%rsp)
$movkey 0x10($key),$rndkey1
@@ -1317,104 +1314,10 @@
cmp \$8,$len # $len is in blocks
jb .Lctr32_tail # short input if ($len<8)
- sub \$6,$len # $len is biased by -6
- cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
- je .Lctr32_6x # [which denotes Atom Silvermont]
-
lea 0x80($key),$key # size optimization
- sub \$2,$len # $len is biased by -8
+ sub \$8,$len # $len is biased by -8
jmp .Lctr32_loop8
-.align 16
-.Lctr32_6x:
- shl \$4,$rounds
- mov \$48,$rnds_
- bswap $key0
- lea 32($key,$rounds),$key # end of key schedule
- sub %rax,%r10 # twisted $rounds
- jmp .Lctr32_loop6
-
-.align 16
-.Lctr32_loop6:
- add \$6,$ctr # next counter value
- $movkey -48($key,$rnds_),$rndkey0
- aesenc $rndkey1,$inout0
- mov $ctr,%eax
- xor $key0,%eax
- aesenc $rndkey1,$inout1
- movbe %eax,`0x00+12`(%rsp) # store next counter value
- lea 1($ctr),%eax
- aesenc $rndkey1,$inout2
- xor $key0,%eax
- movbe %eax,`0x10+12`(%rsp)
- aesenc $rndkey1,$inout3
- lea 2($ctr),%eax
- xor $key0,%eax
- aesenc $rndkey1,$inout4
- movbe %eax,`0x20+12`(%rsp)
- lea 3($ctr),%eax
- aesenc $rndkey1,$inout5
- $movkey -32($key,$rnds_),$rndkey1
- xor $key0,%eax
-
- aesenc $rndkey0,$inout0
- movbe %eax,`0x30+12`(%rsp)
- lea 4($ctr),%eax
- aesenc $rndkey0,$inout1
- xor $key0,%eax
- movbe %eax,`0x40+12`(%rsp)
- aesenc $rndkey0,$inout2
- lea 5($ctr),%eax
- xor $key0,%eax
- aesenc $rndkey0,$inout3
- movbe %eax,`0x50+12`(%rsp)
- mov %r10,%rax # mov $rnds_,$rounds
- aesenc $rndkey0,$inout4
- aesenc $rndkey0,$inout5
- $movkey -16($key,$rnds_),$rndkey0
-
- call .Lenc_loop6
-
- movdqu ($inp),$inout6 # load 6 input blocks
- movdqu 0x10($inp),$inout7
- movdqu 0x20($inp),$in0
- movdqu 0x30($inp),$in1
- movdqu 0x40($inp),$in2
- movdqu 0x50($inp),$in3
- lea 0x60($inp),$inp # $inp+=6*16
- $movkey -64($key,$rnds_),$rndkey1
- pxor $inout0,$inout6 # inp^=E(ctr)
- movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
- pxor $inout1,$inout7
- movaps 0x10(%rsp),$inout1
- pxor $inout2,$in0
- movaps 0x20(%rsp),$inout2
- pxor $inout3,$in1
- movaps 0x30(%rsp),$inout3
- pxor $inout4,$in2
- movaps 0x40(%rsp),$inout4
- pxor $inout5,$in3
- movaps 0x50(%rsp),$inout5
- movdqu $inout6,($out) # store 6 output blocks
- movdqu $inout7,0x10($out)
- movdqu $in0,0x20($out)
- movdqu $in1,0x30($out)
- movdqu $in2,0x40($out)
- movdqu $in3,0x50($out)
- lea 0x60($out),$out # $out+=6*16
-
- sub \$6,$len
- jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
-
- add \$6,$len # restore real remaining $len
- jz .Lctr32_done # done if ($len==0)
-
- lea -48($rnds_),$rounds
- lea -80($key,$rnds_),$key # restore $key
- neg $rounds
- shr \$4,$rounds # restore $rounds
- jmp .Lctr32_tail
-
.align 32
.Lctr32_loop8:
add \$8,$ctr # next counter value
@@ -2906,16 +2809,10 @@
movdqa $inout3,$in3
movdqu 0x50($inp),$inout5
movdqa $inout4,$in4
- leaq OPENSSL_ia32cap_P(%rip),%r9
- mov 4(%r9),%r9d
cmp \$0x70,$len
jbe .Lcbc_dec_six_or_seven
- and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
- sub \$0x50,$len # $len is biased by -5*16
- cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
- je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
- sub \$0x20,$len # $len is biased by -7*16
+ sub \$0x70,$len # $len is biased by -7*16
lea 0x70($key),$key # size optimization
jmp .Lcbc_dec_loop8_enter
.align 16
@@ -3107,51 +3004,6 @@
pxor $inout7,$inout7
jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_loop6:
- movups $inout5,($out)
- lea 0x10($out),$out
- movdqu 0x00($inp),$inout0 # load input
- movdqu 0x10($inp),$inout1
- movdqa $inout0,$in0
- movdqu 0x20($inp),$inout2
- movdqa $inout1,$in1
- movdqu 0x30($inp),$inout3
- movdqa $inout2,$in2
- movdqu 0x40($inp),$inout4
- movdqa $inout3,$in3
- movdqu 0x50($inp),$inout5
- movdqa $inout4,$in4
-.Lcbc_dec_loop6_enter:
- lea 0x60($inp),$inp
- movdqa $inout5,$inout6
-
- call _aesni_decrypt6
-
- pxor $iv,$inout0 # ^= IV
- movdqa $inout6,$iv
- pxor $in0,$inout1
- movdqu $inout0,($out)
- pxor $in1,$inout2
- movdqu $inout1,0x10($out)
- pxor $in2,$inout3
- movdqu $inout2,0x20($out)
- pxor $in3,$inout4
- mov $key_,$key
- movdqu $inout3,0x30($out)
- pxor $in4,$inout5
- mov $rnds_,$rounds
- movdqu $inout4,0x40($out)
- lea 0x50($out),$out
- sub \$0x60,$len
- ja .Lcbc_dec_loop6
-
- movdqa $inout5,$inout0
- add \$0x50,$len
- jle .Lcbc_dec_clear_tail_collected
- movups $inout5,($out)
- lea 0x10($out),$out
-
.Lcbc_dec_tail:
movups ($inp),$inout0
sub \$0x10,$len
diff --git a/crypto/fipsmodule/modes/asm/ghash-x86_64.pl b/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
index 8ad1104..33ee1cb 100644
--- a/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
+++ b/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
@@ -120,7 +120,6 @@
$code=<<___;
.text
-.extern OPENSSL_ia32cap_P
___
@@ -387,15 +386,9 @@
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
$code.=<<___;
- leaq OPENSSL_ia32cap_P(%rip),%rax
- mov 4(%rax),%eax
cmp \$0x30,$len
jb .Lskip4x
- and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
- cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
- je .Lskip4x
-
sub \$0x30,$len
mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
movdqu 0x30($Htbl),$Hkey3