Remove OPENSSL_ia32cap_P references from ChaCha20-Poly1305 assembly. CPU dispatch is now all moved out of assembly. Fixed: 42290548 Change-Id: Icbe5053255d4aa76406b5303ba515ec38d42cb0d Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/70809 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/cipher_extra/aead_test.cc b/crypto/cipher_extra/aead_test.cc index e16f5a2..9207ccc 100644 --- a/crypto/cipher_extra/aead_test.cc +++ b/crypto/cipher_extra/aead_test.cc
@@ -825,15 +825,33 @@ for (size_t len = 0; len <= 1024; len += 5) { SCOPED_TRACE(len); union chacha20_poly1305_open_data open_ctx = {}; +#if defined(OPENSSL_X86_64) + CHECK_ABI(chacha20_poly1305_open_nohw, buf.get(), buf.get(), len, buf.get(), + len % 128, &open_ctx); + if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) { + CHECK_ABI(chacha20_poly1305_open_avx2, buf.get(), buf.get(), len, + buf.get(), len % 128, &open_ctx); + } +#else CHECK_ABI(chacha20_poly1305_open, buf.get(), buf.get(), len, buf.get(), len % 128, &open_ctx); +#endif } for (size_t len = 0; len <= 1024; len += 5) { SCOPED_TRACE(len); union chacha20_poly1305_seal_data seal_ctx = {}; +#if defined(OPENSSL_X86_64) + CHECK_ABI(chacha20_poly1305_seal_nohw, buf.get(), buf.get(), len, buf.get(), + len % 128, &seal_ctx); + if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) { + CHECK_ABI(chacha20_poly1305_seal_avx2, buf.get(), buf.get(), len, + buf.get(), len % 128, &seal_ctx); + } +#else CHECK_ABI(chacha20_poly1305_seal, buf.get(), buf.get(), len, buf.get(), len % 128, &seal_ctx); +#endif } } #endif // SUPPORTS_ABI_TEST
diff --git a/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl b/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl index fb11760..9297052 100644 --- a/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl +++ b/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl
@@ -37,13 +37,9 @@ $avx = 2; $code.=<<___; -.text -.extern OPENSSL_ia32cap_P - -chacha20_poly1305_constants: - .section .rodata .align 64 +chacha20_poly1305_constants: .Lchacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' @@ -444,10 +440,10 @@ # union chacha20_poly1305_open_data *aead_data) # $code.=" -.globl chacha20_poly1305_open -.type chacha20_poly1305_open,\@function,6 +.globl chacha20_poly1305_open_nohw +.type chacha20_poly1305_open_nohw,\@function,6 .align 64 -chacha20_poly1305_open: +chacha20_poly1305_open_nohw: .cfi_startproc _CET_ENDBR push %rbp @@ -485,13 +481,8 @@ $code.=" mov %rdx, $inl mov $adl, 0+$len_store - mov $inl, 8+$len_store\n"; -$code.=" - mov OPENSSL_ia32cap_P+8(%rip), %eax - and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present - xor \$`(1<<5) + (1<<8)`, %eax - jz chacha20_poly1305_open_avx2\n" if ($avx>1); -$code.=" + mov $inl, 8+$len_store + cmp \$128, $inl jbe .Lopen_sse_128 # For long buffers, prepare the poly key first @@ -858,7 +849,7 @@ movdqa $C2, $B2 movdqa $D2, $C2 jmp .Lopen_sse_128_xor_hash -.size chacha20_poly1305_open, .-chacha20_poly1305_open +.size chacha20_poly1305_open_nohw, .-chacha20_poly1305_open_nohw .cfi_endproc ################################################################################ @@ -867,10 +858,10 @@ # size_t plaintext_len, const uint8_t *ad, # size_t ad_len, # union chacha20_poly1305_seal_data *data); -.globl chacha20_poly1305_seal -.type chacha20_poly1305_seal,\@function,6 +.globl chacha20_poly1305_seal_nohw +.type chacha20_poly1305_seal_nohw,\@function,6 .align 64 -chacha20_poly1305_seal: +chacha20_poly1305_seal_nohw: .cfi_startproc _CET_ENDBR push %rbp @@ -909,13 +900,8 @@ addq %rdx, $inl mov $adl, 0+$len_store mov $inl, 8+$len_store - mov %rdx, $inl\n"; -$code.=" - mov OPENSSL_ia32cap_P+8(%rip), %eax - and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present - xor 
\$`(1<<5) + (1<<8)`, %eax - jz chacha20_poly1305_seal_avx2\n" if ($avx>1); -$code.=" + mov %rdx, $inl + cmp \$128, $inl jbe .Lseal_sse_128 # For longer buffers, prepare the poly key + some stream @@ -1371,7 +1357,7 @@ mov %r8, $itr2 call poly_hash_ad_internal jmp .Lseal_sse_128_tail_xor -.size chacha20_poly1305_seal, .-chacha20_poly1305_seal +.size chacha20_poly1305_seal_nohw, .-chacha20_poly1305_seal_nohw .cfi_endproc\n"; } @@ -1643,21 +1629,49 @@ $code.=" ############################################################################### -.type chacha20_poly1305_open_avx2,\@abi-omnipotent +.globl chacha20_poly1305_open_avx2 +.type chacha20_poly1305_open_avx2,\@function,6 .align 64 chacha20_poly1305_open_avx2: .cfi_startproc - -# Since the AVX2 function operates in the frame of the SSE function, we just copy the frame state to over here + _CET_ENDBR + push %rbp .cfi_push %rbp + push %rbx .cfi_push %rbx + push %r12 .cfi_push %r12 + push %r13 .cfi_push %r13 + push %r14 .cfi_push %r14 + push %r15 .cfi_push %r15 + # We write the calculated authenticator back to keyp at the end, so save + # the pointer on the stack too. 
+ push $keyp .cfi_push $keyp + sub \$288 + $xmm_storage + 32, %rsp .cfi_adjust_cfa_offset 288 + 32 + lea 32(%rsp), %rbp + and \$-32, %rbp\n"; +$code.=" + movaps %xmm6,16*0+$xmm_store + movaps %xmm7,16*1+$xmm_store + movaps %xmm8,16*2+$xmm_store + movaps %xmm9,16*3+$xmm_store + movaps %xmm10,16*4+$xmm_store + movaps %xmm11,16*5+$xmm_store + movaps %xmm12,16*6+$xmm_store + movaps %xmm13,16*7+$xmm_store + movaps %xmm14,16*8+$xmm_store + movaps %xmm15,16*9+$xmm_store\n" if ($win64); +$code.=" + mov %rdx, $inl + mov $adl, 0+$len_store + mov $inl, 8+$len_store + vzeroupper vmovdqa .Lchacha20_consts(%rip), $A0 vbroadcasti128 0*16($keyp), $B0 @@ -2113,20 +2127,49 @@ .cfi_endproc ############################################################################### ############################################################################### -.type chacha20_poly1305_seal_avx2,\@abi-omnipotent +.globl chacha20_poly1305_seal_avx2 +.type chacha20_poly1305_seal_avx2,\@function,6 .align 64 chacha20_poly1305_seal_avx2: .cfi_startproc - -# Since the AVX2 function operates in the frame of the SSE function, we just copy the frame state to over here + _CET_ENDBR + push %rbp .cfi_push %rbp + push %rbx .cfi_push %rbx + push %r12 .cfi_push %r12 + push %r13 .cfi_push %r13 + push %r14 .cfi_push %r14 -.cfi_push %r15 + push %r15 +.cfi_push %r15 +# We write the calculated authenticator back to keyp at the end, so save +# the pointer on the stack too. 
+ push $keyp .cfi_push $keyp + sub \$288 + $xmm_storage + 32, %rsp .cfi_adjust_cfa_offset 288 + 32 + lea 32(%rsp), %rbp + and \$-32, %rbp\n"; +$code.=" + movaps %xmm6,16*0+$xmm_store + movaps %xmm7,16*1+$xmm_store + movaps %xmm8,16*2+$xmm_store + movaps %xmm9,16*3+$xmm_store + movaps %xmm10,16*4+$xmm_store + movaps %xmm11,16*5+$xmm_store + movaps %xmm12,16*6+$xmm_store + movaps %xmm13,16*7+$xmm_store + movaps %xmm14,16*8+$xmm_store + movaps %xmm15,16*9+$xmm_store\n" if ($win64); +$code.=" + mov 56($keyp), $inl # extra_in_len + addq %rdx, $inl + mov $adl, 0+$len_store + mov $inl, 8+$len_store + mov %rdx, $inl vzeroupper vmovdqa .Lchacha20_consts(%rip), $A0
diff --git a/crypto/cipher_extra/internal.h b/crypto/cipher_extra/internal.h index 39ab950..32c0353 100644 --- a/crypto/cipher_extra/internal.h +++ b/crypto/cipher_extra/internal.h
@@ -192,22 +192,65 @@ // Additional input parameters are passed in |aead_data->in|. On exit, it will // write calculated tag value to |aead_data->out.tag|, which the caller must // check. +#if defined(OPENSSL_X86_64) +extern void chacha20_poly1305_open_nohw( + uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data); +extern void chacha20_poly1305_open_avx2( + uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data); +OPENSSL_INLINE void chacha20_poly1305_open(uint8_t *out_plaintext, + const uint8_t *ciphertext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_open_data *data) { + if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) { + chacha20_poly1305_open_avx2(out_plaintext, ciphertext, plaintext_len, ad, + ad_len, data); + } else { + chacha20_poly1305_open_nohw(out_plaintext, ciphertext, plaintext_len, ad, + ad_len, data); + } +} +#else extern void chacha20_poly1305_open(uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len, const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data); +#endif // chacha20_poly1305_open is defined in chacha20_poly1305_*.pl. It encrypts // |plaintext_len| bytes from |plaintext| and writes them to |out_ciphertext|. // Additional input parameters are passed in |aead_data->in|. The calculated tag // value is over the computed ciphertext concatenated with |extra_ciphertext| // and written to |aead_data->out.tag|. 
+#if defined(OPENSSL_X86_64) +extern void chacha20_poly1305_seal_nohw( + uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data); +extern void chacha20_poly1305_seal_avx2( + uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data); +OPENSSL_INLINE void chacha20_poly1305_seal( + uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data) { + if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) { + chacha20_poly1305_seal_avx2(out_ciphertext, plaintext, plaintext_len, ad, + ad_len, data); + } else { + chacha20_poly1305_seal_nohw(out_ciphertext, plaintext, plaintext_len, ad, + ad_len, data); + } +} +#else extern void chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data); +#endif + #else OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) { return 0; }
diff --git a/gen/crypto/chacha20_poly1305_x86_64-apple.S b/gen/crypto/chacha20_poly1305_x86_64-apple.S index e4a7202..5ae5849 100644 --- a/gen/crypto/chacha20_poly1305_x86_64-apple.S +++ b/gen/crypto/chacha20_poly1305_x86_64-apple.S
@@ -4,13 +4,9 @@ #include <openssl/asm_base.h> #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) -.text - - -chacha20_poly1305_constants: - .section __DATA,__const .p2align 6 +chacha20_poly1305_constants: L$chacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' @@ -216,11 +212,11 @@ -.globl _chacha20_poly1305_open -.private_extern _chacha20_poly1305_open +.globl _chacha20_poly1305_open_nohw +.private_extern _chacha20_poly1305_open_nohw .p2align 6 -_chacha20_poly1305_open: +_chacha20_poly1305_open_nohw: _CET_ENDBR pushq %rbp @@ -249,11 +245,6 @@ movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) - movl _OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_open_avx2 - cmpq $128,%rbx jbe L$open_sse_128 @@ -2088,11 +2079,11 @@ -.globl _chacha20_poly1305_seal -.private_extern _chacha20_poly1305_seal +.globl _chacha20_poly1305_seal_nohw +.private_extern _chacha20_poly1305_seal_nohw .p2align 6 -_chacha20_poly1305_seal: +_chacha20_poly1305_seal_nohw: _CET_ENDBR pushq %rbp @@ -2122,11 +2113,6 @@ movq %rbx,8+0+32(%rbp) movq %rdx,%rbx - movl _OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_seal_avx2 - cmpq $128,%rbx jbe L$seal_sse_128 @@ -4075,20 +4061,38 @@ +.globl _chacha20_poly1305_open_avx2 +.private_extern _chacha20_poly1305_open_avx2 .p2align 6 -chacha20_poly1305_open_avx2: +_chacha20_poly1305_open_avx2: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + pushq %r9 + + subq $288 + 0 + 32,%rsp + leaq 32(%rsp),%rbp + andq $-32,%rbp - - - - - + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) vzeroupper vmovdqa L$chacha20_consts(%rip),%ymm0 @@ -6223,20 +6227,39 @@ +.globl _chacha20_poly1305_seal_avx2 +.private_extern _chacha20_poly1305_seal_avx2 .p2align 6 -chacha20_poly1305_seal_avx2: 
+_chacha20_poly1305_seal_avx2: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + pushq %r9 + subq $288 + 0 + 32,%rsp + leaq 32(%rsp),%rbp + andq $-32,%rbp - - - - - + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx vzeroupper vmovdqa L$chacha20_consts(%rip),%ymm0
diff --git a/gen/crypto/chacha20_poly1305_x86_64-linux.S b/gen/crypto/chacha20_poly1305_x86_64-linux.S index ac38f8f..22123ee 100644 --- a/gen/crypto/chacha20_poly1305_x86_64-linux.S +++ b/gen/crypto/chacha20_poly1305_x86_64-linux.S
@@ -4,14 +4,9 @@ #include <openssl/asm_base.h> #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) -.text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -chacha20_poly1305_constants: - .section .rodata .align 64 +chacha20_poly1305_constants: .Lchacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' @@ -217,11 +212,11 @@ .cfi_endproc .size poly_hash_ad_internal, .-poly_hash_ad_internal -.globl chacha20_poly1305_open -.hidden chacha20_poly1305_open -.type chacha20_poly1305_open,@function +.globl chacha20_poly1305_open_nohw +.hidden chacha20_poly1305_open_nohw +.type chacha20_poly1305_open_nohw,@function .align 64 -chacha20_poly1305_open: +chacha20_poly1305_open_nohw: .cfi_startproc _CET_ENDBR pushq %rbp @@ -257,11 +252,6 @@ movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) - movl OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_open_avx2 - cmpq $128,%rbx jbe .Lopen_sse_128 @@ -2094,7 +2084,7 @@ movdqa %xmm10,%xmm6 movdqa %xmm14,%xmm10 jmp .Lopen_sse_128_xor_hash -.size chacha20_poly1305_open, .-chacha20_poly1305_open +.size chacha20_poly1305_open_nohw, .-chacha20_poly1305_open_nohw .cfi_endproc @@ -2103,11 +2093,11 @@ -.globl chacha20_poly1305_seal -.hidden chacha20_poly1305_seal -.type chacha20_poly1305_seal,@function +.globl chacha20_poly1305_seal_nohw +.hidden chacha20_poly1305_seal_nohw +.type chacha20_poly1305_seal_nohw,@function .align 64 -chacha20_poly1305_seal: +chacha20_poly1305_seal_nohw: .cfi_startproc _CET_ENDBR pushq %rbp @@ -2144,11 +2134,6 @@ movq %rbx,8+0+32(%rbp) movq %rdx,%rbx - movl OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_seal_avx2 - cmpq $128,%rbx jbe .Lseal_sse_128 @@ -4100,32 +4085,50 @@ movq %r8,%r8 call poly_hash_ad_internal jmp .Lseal_sse_128_tail_xor -.size chacha20_poly1305_seal, .-chacha20_poly1305_seal +.size 
chacha20_poly1305_seal_nohw, .-chacha20_poly1305_seal_nohw .cfi_endproc +.globl chacha20_poly1305_open_avx2 +.hidden chacha20_poly1305_open_avx2 .type chacha20_poly1305_open_avx2,@function .align 64 chacha20_poly1305_open_avx2: .cfi_startproc - - +_CET_ENDBR + pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 + pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 + pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 + pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 + pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 + pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 + + + pushq %r9 .cfi_adjust_cfa_offset 8 .cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp .cfi_adjust_cfa_offset 288 + 32 + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + vzeroupper vmovdqa .Lchacha20_consts(%rip),%ymm0 vbroadcasti128 0(%r9),%ymm4 @@ -6259,27 +6262,46 @@ .cfi_endproc +.globl chacha20_poly1305_seal_avx2 +.hidden chacha20_poly1305_seal_avx2 .type chacha20_poly1305_seal_avx2,@function .align 64 chacha20_poly1305_seal_avx2: .cfi_startproc - - +_CET_ENDBR + pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 + pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 + pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 + pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 + pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 + pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 + + + pushq %r9 .cfi_adjust_cfa_offset 8 .cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp .cfi_adjust_cfa_offset 288 + 32 + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx vzeroupper vmovdqa .Lchacha20_consts(%rip),%ymm0
diff --git a/gen/crypto/chacha20_poly1305_x86_64-win.asm b/gen/crypto/chacha20_poly1305_x86_64-win.asm index 095689c..22dbdfe 100644 --- a/gen/crypto/chacha20_poly1305_x86_64-win.asm +++ b/gen/crypto/chacha20_poly1305_x86_64-win.asm
@@ -11,14 +11,9 @@ %ifdef BORINGSSL_PREFIX %include "boringssl_prefix_symbols_nasm.inc" %endif -section .text code align=64 - -EXTERN OPENSSL_ia32cap_P - -chacha20_poly1305_constants: - section .rdata rdata align=8 ALIGN 64 +chacha20_poly1305_constants: $L$chacha20_consts: DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' @@ -55,7 +50,7 @@ DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff -section .text +section .text code align=64 @@ -225,14 +220,14 @@ -global chacha20_poly1305_open +global chacha20_poly1305_open_nohw ALIGN 64 -chacha20_poly1305_open: +chacha20_poly1305_open_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_chacha20_poly1305_open: +$L$SEH_begin_chacha20_poly1305_open_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -280,11 +275,6 @@ mov QWORD[((0+160+32))+rbp],r8 mov QWORD[((8+160+32))+rbp],rbx - mov eax,DWORD[((OPENSSL_ia32cap_P+8))] - and eax,288 - xor eax,288 - jz NEAR chacha20_poly1305_open_avx2 - cmp rbx,128 jbe NEAR $L$open_sse_128 @@ -2123,7 +2113,7 @@ movdqa xmm6,xmm10 movdqa xmm10,xmm14 jmp NEAR $L$open_sse_128_xor_hash -$L$SEH_end_chacha20_poly1305_open: +$L$SEH_end_chacha20_poly1305_open_nohw: @@ -2132,14 +2122,14 @@ -global chacha20_poly1305_seal +global chacha20_poly1305_seal_nohw ALIGN 64 -chacha20_poly1305_seal: +chacha20_poly1305_seal_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_chacha20_poly1305_seal: +$L$SEH_begin_chacha20_poly1305_seal_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -2188,11 +2178,6 @@ mov QWORD[((8+160+32))+rbp],rbx mov rbx,rdx - mov eax,DWORD[((OPENSSL_ia32cap_P+8))] - and eax,288 - xor eax,288 - jz NEAR chacha20_poly1305_seal_avx2 - cmp rbx,128 jbe NEAR 
$L$seal_sse_128 @@ -4150,24 +4135,64 @@ mov r8,r8 call poly_hash_ad_internal jmp NEAR $L$seal_sse_128_tail_xor -$L$SEH_end_chacha20_poly1305_seal: +$L$SEH_end_chacha20_poly1305_seal_nohw: +global chacha20_poly1305_open_avx2 ALIGN 64 chacha20_poly1305_open_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_open_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + push r9 + + sub rsp,288 + 160 + 32 + lea rbp,[32+rsp] + and rbp,-32 + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 - + mov rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx vzeroupper vmovdqa ymm0,YMMWORD[$L$chacha20_consts] @@ -6298,24 +6323,65 @@ vperm2i128 ymm2,ymm6,ymm2,0x13 vperm2i128 ymm6,ymm14,ymm10,0x13 jmp NEAR $L$open_avx2_short +$L$SEH_end_chacha20_poly1305_open_avx2: - +global chacha20_poly1305_seal_avx2 ALIGN 64 chacha20_poly1305_seal_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_seal_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + push r9 + sub rsp,288 + 160 + 32 + lea rbp,[32+rsp] + and rbp,-32 + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps 
XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 - + mov rbx,QWORD[56+r9] + add rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + mov rbx,rdx vzeroupper vmovdqa ymm0,YMMWORD[$L$chacha20_consts] @@ -8950,7 +9016,7 @@ vzeroupper jmp NEAR $L$seal_sse_tail_16 - +$L$SEH_end_chacha20_poly1305_seal_avx2: %else ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 ret