aes gcm: Remove Atom Silvermont optimizations.

Goldmont, the successor to Silvermont in the Atom line, added XSAVE, so
these "MOVBE without XSAVE" code paths are strictly for Silvermont. The
remaining generic code paths will still work on Silvermont, just with
reduced performance.

Change-Id: I57f530f487e0f9b6b3f6aac912dbfaaa46628b9f
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64788
Commit-Queue: Bob Beck <bbe@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
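For illustration, the dispatch these paths hung off is a single cached CPUID
word: the second 32-bit word of OPENSSL_ia32cap_P holds ECX from CPUID leaf 1,
where bit 22 is MOVBE and bit 26 is XSAVE. A minimal C sketch of the removed
check, assuming only that layout (the function name and driver below are
illustrative, not library API):

    #include <stdint.h>
    #include <stdio.h>

    /* Simulated capability word; the real OPENSSL_ia32cap_P word 1 caches
     * ECX from CPUID leaf 1. MOVBE set with XSAVE clear is what Silvermont
     * reports. */
    static const uint32_t ia32cap_word1 = 1u << 22;

    /* Mirrors the removed assembly: and with $(1<<26|1<<22), then compare
     * against $(1<<22), i.e. "MOVBE without XSAVE". */
    static int is_atom_silvermont(uint32_t ecx) {
      return (ecx & ((1u << 26) | (1u << 22))) == (1u << 22);
    }

    int main(void) {
      printf("silvermont: %d\n", is_atom_silvermont(ia32cap_word1));
      return 0;
    }

Goldmont and later set both bits, so the comparison fails there and the
generic eight-block paths run instead.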
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 414b76b..a8abb4d 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -1303,10 +1303,7 @@
 	lea	7($ctr),%r9
 	mov	%r10d,0x60+12(%rsp)
 	bswap	%r9d
-	leaq	OPENSSL_ia32cap_P(%rip),%r10
-	mov	4(%r10),%r10d
 	xor	$key0,%r9d
-	and	\$`1<<26|1<<22`,%r10d	# isolate XSAVE+MOVBE
 	mov	%r9d,0x70+12(%rsp)
 
 	$movkey	0x10($key),$rndkey1
@@ -1317,104 +1314,10 @@
 	cmp	\$8,$len		# $len is in blocks
 	jb	.Lctr32_tail		# short input if ($len<8)
 
-	sub	\$6,$len		# $len is biased by -6
-	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
-	je	.Lctr32_6x		# [which denotes Atom Silvermont]
-
 	lea	0x80($key),$key		# size optimization
-	sub	\$2,$len		# $len is biased by -8
+	sub	\$8,$len		# $len is biased by -8
 	jmp	.Lctr32_loop8
 
-.align	16
-.Lctr32_6x:
-	shl	\$4,$rounds
-	mov	\$48,$rnds_
-	bswap	$key0
-	lea	32($key,$rounds),$key	# end of key schedule
-	sub	%rax,%r10		# twisted $rounds
-	jmp	.Lctr32_loop6
-
-.align	16
-.Lctr32_loop6:
-	add	\$6,$ctr		# next counter value
-	$movkey	-48($key,$rnds_),$rndkey0
-	aesenc	$rndkey1,$inout0
-	mov	$ctr,%eax
-	xor	$key0,%eax
-	aesenc	$rndkey1,$inout1
-	movbe	%eax,`0x00+12`(%rsp)	# store next counter value
-	lea	1($ctr),%eax
-	aesenc	$rndkey1,$inout2
-	xor	$key0,%eax
-	movbe	%eax,`0x10+12`(%rsp)
-	aesenc	$rndkey1,$inout3
-	lea	2($ctr),%eax
-	xor	$key0,%eax
-	aesenc	$rndkey1,$inout4
-	movbe	%eax,`0x20+12`(%rsp)
-	lea	3($ctr),%eax
-	aesenc	$rndkey1,$inout5
-	$movkey	-32($key,$rnds_),$rndkey1
-	xor	$key0,%eax
-
-	aesenc	$rndkey0,$inout0
-	movbe	%eax,`0x30+12`(%rsp)
-	lea	4($ctr),%eax
-	aesenc	$rndkey0,$inout1
-	xor	$key0,%eax
-	movbe	%eax,`0x40+12`(%rsp)
-	aesenc	$rndkey0,$inout2
-	lea	5($ctr),%eax
-	xor	$key0,%eax
-	aesenc	$rndkey0,$inout3
-	movbe	%eax,`0x50+12`(%rsp)
-	mov	%r10,%rax		# mov	$rnds_,$rounds
-	aesenc	$rndkey0,$inout4
-	aesenc	$rndkey0,$inout5
-	$movkey	-16($key,$rnds_),$rndkey0
-
-	call	.Lenc_loop6
-
-	movdqu	($inp),$inout6		# load 6 input blocks
-	movdqu	0x10($inp),$inout7
-	movdqu	0x20($inp),$in0
-	movdqu	0x30($inp),$in1
-	movdqu	0x40($inp),$in2
-	movdqu	0x50($inp),$in3
-	lea	0x60($inp),$inp		# $inp+=6*16
-	$movkey	-64($key,$rnds_),$rndkey1
-	pxor	$inout0,$inout6		# inp^=E(ctr)
-	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
-	pxor	$inout1,$inout7
-	movaps	0x10(%rsp),$inout1
-	pxor	$inout2,$in0
-	movaps	0x20(%rsp),$inout2
-	pxor	$inout3,$in1
-	movaps	0x30(%rsp),$inout3
-	pxor	$inout4,$in2
-	movaps	0x40(%rsp),$inout4
-	pxor	$inout5,$in3
-	movaps	0x50(%rsp),$inout5
-	movdqu	$inout6,($out)		# store 6 output blocks
-	movdqu	$inout7,0x10($out)
-	movdqu	$in0,0x20($out)
-	movdqu	$in1,0x30($out)
-	movdqu	$in2,0x40($out)
-	movdqu	$in3,0x50($out)
-	lea	0x60($out),$out		# $out+=6*16
-
-	sub	\$6,$len
-	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow
-
-	add	\$6,$len		# restore real remaining $len
-	jz	.Lctr32_done		# done if ($len==0)
-
-	lea	-48($rnds_),$rounds
-	lea	-80($key,$rnds_),$key	# restore $key
-	neg	$rounds
-	shr	\$4,$rounds		# restore $rounds
-	jmp	.Lctr32_tail
-
 .align	32
 .Lctr32_loop8:
 	add	\$8,$ctr		# next counter value
@@ -2906,16 +2809,10 @@
 	movdqa	$inout3,$in3
 	movdqu	0x50($inp),$inout5
 	movdqa	$inout4,$in4
-	leaq	OPENSSL_ia32cap_P(%rip),%r9
-	mov	4(%r9),%r9d
 	cmp	\$0x70,$len
 	jbe	.Lcbc_dec_six_or_seven
 
-	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
-	sub	\$0x50,$len		# $len is biased by -5*16
-	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
-	je	.Lcbc_dec_loop6_enter	# [which denotes Atom Silvermont]
-	sub	\$0x20,$len		# $len is biased by -7*16
+	sub	\$0x70,$len		# $len is biased by -7*16
 	lea	0x70($key),$key		# size optimization
 	jmp	.Lcbc_dec_loop8_enter
 .align	16
@@ -3107,51 +3004,6 @@
 	pxor	$inout7,$inout7
 	jmp	.Lcbc_dec_tail_collected
 
-.align	16
-.Lcbc_dec_loop6:
-	movups	$inout5,($out)
-	lea	0x10($out),$out
-	movdqu	0x00($inp),$inout0	# load input
-	movdqu	0x10($inp),$inout1
-	movdqa	$inout0,$in0
-	movdqu	0x20($inp),$inout2
-	movdqa	$inout1,$in1
-	movdqu	0x30($inp),$inout3
-	movdqa	$inout2,$in2
-	movdqu	0x40($inp),$inout4
-	movdqa	$inout3,$in3
-	movdqu	0x50($inp),$inout5
-	movdqa	$inout4,$in4
-.Lcbc_dec_loop6_enter:
-	lea	0x60($inp),$inp
-	movdqa	$inout5,$inout6
-
-	call	_aesni_decrypt6
-
-	pxor	$iv,$inout0		# ^= IV
-	movdqa	$inout6,$iv
-	pxor	$in0,$inout1
-	movdqu	$inout0,($out)
-	pxor	$in1,$inout2
-	movdqu	$inout1,0x10($out)
-	pxor	$in2,$inout3
-	movdqu	$inout2,0x20($out)
-	pxor	$in3,$inout4
-	mov	$key_,$key
-	movdqu	$inout3,0x30($out)
-	pxor	$in4,$inout5
-	mov	$rnds_,$rounds
-	movdqu	$inout4,0x40($out)
-	lea	0x50($out),$out
-	sub	\$0x60,$len
-	ja	.Lcbc_dec_loop6
-
-	movdqa	$inout5,$inout0
-	add	\$0x50,$len
-	jle	.Lcbc_dec_clear_tail_collected
-	movups	$inout5,($out)
-	lea	0x10($out),$out
-
 .Lcbc_dec_tail:
 	movups	($inp),$inout0
 	sub	\$0x10,$len
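For reference, the loop shape the CTR path keeps after this change: the block
count is biased by -8 so the loop condition is a borrow check, then the bias
is undone for the tail. A standalone C sketch under that reading of the
assembly (the helper name and example length are illustrative):

    #include <stddef.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the 8-block AES-CTR loop body. */
    static void encrypt_8_blocks(void) {
      printf("processed 8 blocks\n");
    }

    int main(void) {
      ptrdiff_t len = 21;     /* block count; example value */
      if (len >= 8) {         /* mirrors "cmp \$8,$len; jb .Lctr32_tail" */
        len -= 8;             /* mirrors "sub \$8,$len": biased by -8 */
        do {
          encrypt_8_blocks();
          len -= 8;
        } while (len >= 0);   /* loop while the subtraction didn't borrow */
        len += 8;             /* restore the real remaining block count */
      }
      printf("tail blocks: %td\n", len);
      return 0;
    }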
diff --git a/crypto/fipsmodule/modes/asm/ghash-x86_64.pl b/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
index 8ad1104..33ee1cb 100644
--- a/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
+++ b/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
@@ -120,7 +120,6 @@
 
 $code=<<___;
 .text
-.extern	OPENSSL_ia32cap_P
 ___
 
 
@@ -387,15 +386,9 @@
 my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
 
 $code.=<<___;
-	leaq	OPENSSL_ia32cap_P(%rip),%rax
-	mov	4(%rax),%eax
 	cmp	\$0x30,$len
 	jb	.Lskip4x
 
-	and	\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
-	cmp	\$`1<<22`,%eax		# check for MOVBE without XSAVE
-	je	.Lskip4x
-
 	sub	\$0x30,$len
 	mov	\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
 	movdqu	0x30($Htbl),$Hkey3
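After this hunk the four-block aggregated GHASH path is gated purely by input
length: at least 0x30 bytes, i.e. three 16-byte blocks, with no CPU-model
check. A small C sketch of the resulting dispatch (the helper names are
hypothetical stand-ins, not library API):

    #include <stddef.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for the aggregated and one-block GHASH loops. */
    static void ghash_4x(size_t blocks) { printf("4x path: %zu blocks\n", blocks); }
    static void ghash_1x(size_t blocks) { printf("1x path: %zu blocks\n", blocks); }

    int main(void) {
      size_t len = 0x80;   /* input length in bytes; example value */
      if (len >= 0x30) {   /* mirrors "cmp \$0x30,$len; jb .Lskip4x" */
        ghash_4x(len / 16);
      } else {
        ghash_1x(len / 16);
      }
      return 0;
    }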