Fix ChaCha20-Poly1305 x86-64 asm on Windows Current: Did 2916000 ChaCha20-Poly1305 (16 bytes) seal operations in 1015000us (2872906.4 ops/sec): 46.0 MB/s Did 1604750 ChaCha20-Poly1305 (256 bytes) seal operations in 1016000us (1579478.3 ops/sec): 404.3 MB/s Did 516750 ChaCha20-Poly1305 (1350 bytes) seal operations in 1015000us (509113.3 ops/sec): 687.3 MB/s Did 99750 ChaCha20-Poly1305 (8192 bytes) seal operations in 1016000us (98179.1 ops/sec): 804.3 MB/s Did 50500 ChaCha20-Poly1305 (16384 bytes) seal operations in 1016000us (49704.7 ops/sec): 814.4 MB/s With fix: Did 6366750 ChaCha20-Poly1305 (16 bytes) seal operations in 1016000us (6266486.2 ops/sec): 100.3 MB/s Did 3938000 ChaCha20-Poly1305 (256 bytes) seal operations in 1016000us (3875984.3 ops/sec): 992.3 MB/s Did 1207750 ChaCha20-Poly1305 (1350 bytes) seal operations in 1015000us (1189901.5 ops/sec): 1606.4 MB/s Did 258500 ChaCha20-Poly1305 (8192 bytes) seal operations in 1016000us (254429.1 ops/sec): 2084.3 MB/s Did 131500 ChaCha20-Poly1305 (16384 bytes) seal operations in 1016000us (129429.1 ops/sec): 2120.6 MB/s Change-Id: Iec6417b9855b9d3d1d5154c93a370f80f219c65f Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/44347 Reviewed-by: David Benjamin <davidben@google.com> Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/cipher_extra/aead_test.cc b/crypto/cipher_extra/aead_test.cc index d0c266a..bf02e78 100644 --- a/crypto/cipher_extra/aead_test.cc +++ b/crypto/cipher_extra/aead_test.cc
@@ -24,6 +24,7 @@ #include <openssl/err.h> #include "../fipsmodule/cipher/internal.h" +#include "internal.h" #include "../internal.h" #include "../test/abi_test.h" #include "../test/file_test.h" @@ -827,6 +828,27 @@ EXPECT_EQ(Bytes(plaintext + 1, sizeof(plaintext) - 1), Bytes(plaintext2 + 1, plaintext2_len)); } + +TEST(ChaChaPoly1305Test, ABI) { + if (!chacha20_poly1305_asm_capable()) { + return; + } + + std::unique_ptr<uint8_t[]> buf(new uint8_t[1024]); + for (size_t len = 0; len <= 1024; len += 5) { + SCOPED_TRACE(len); + union chacha20_poly1305_open_data open_ctx = {}; + CHECK_ABI(chacha20_poly1305_open, buf.get(), buf.get(), len, buf.get(), + len % 128, &open_ctx); + } + + for (size_t len = 0; len <= 1024; len += 5) { + SCOPED_TRACE(len); + union chacha20_poly1305_seal_data seal_ctx = {}; + CHECK_ABI(chacha20_poly1305_seal, buf.get(), buf.get(), len, buf.get(), + len % 128, &seal_ctx); + } +} #endif // SUPPORTS_ABI_TEST TEST(AEADTest, AESCCMLargeAD) {
diff --git a/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl b/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl index b748c23..3826cb7 100644 --- a/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl +++ b/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl
@@ -43,26 +43,26 @@ chacha20_poly1305_constants: .align 64 -.chacha20_consts: +.Lchacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' -.rol8: +.Lrol8: .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 -.rol16: +.Lrol16: .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 -.avx2_init: +.Lavx2_init: .long 0,0,0,0 -.sse_inc: +.Lsse_inc: .long 1,0,0,0 -.avx2_inc: +.Lavx2_inc: .long 2,0,0,0,2,0,0,0 -.clamp: +.Lclamp: .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC .quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF .align 16 -.and_masks: +.Land_masks: .byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 @@ -81,28 +81,33 @@ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff ___ -my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8"); +my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2,$adl)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8","%r8"); my ($acc0,$acc1,$acc2)=map("%r$_",(10..12)); my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9"); my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15)); my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); -my $r_store="0*16(%rbp)"; -my $s_store="1*16(%rbp)"; -my $len_store="2*16(%rbp)"; -my $state1_store="3*16(%rbp)"; -my $state2_store="4*16(%rbp)"; -my $tmp_store="5*16(%rbp)"; -my $ctr0_store="6*16(%rbp)"; -my $ctr1_store="7*16(%rbp)"; -my $ctr2_store="8*16(%rbp)"; -my $ctr3_store="9*16(%rbp)"; +my $xmm_storage = 0; +if ($win64) { + $xmm_storage = 10*16; +} +my $xmm_store="0*16(%rbp)"; +my $r_store="$xmm_storage+0*16(%rbp)"; +my $s_store="$xmm_storage+1*16(%rbp)"; +my $len_store="$xmm_storage+2*16(%rbp)"; +my $state1_store="$xmm_storage+3*16(%rbp)"; +my $state2_store="$xmm_storage+4*16(%rbp)"; +my $tmp_store="$xmm_storage+5*16(%rbp)"; +my $ctr0_store="$xmm_storage+6*16(%rbp)"; +my $ctr1_store="$xmm_storage+7*16(%rbp)"; +my $ctr2_store="$xmm_storage+8*16(%rbp)"; +my $ctr3_store="$xmm_storage+9*16(%rbp)"; sub chacha_qr { my ($a,$b,$c,$d,$t,$dir)=@_; $code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/); $code.="paddd $b, $a pxor $a, $d - pshufb .rol16(%rip), $d + pshufb .Lrol16(%rip), $d paddd $d, $c pxor $c, $b movdqa $b, $t @@ -111,7 +116,7 @@ pxor $t, $b paddd $b, $a pxor $a, $d - pshufb .rol8(%rip), $d + pshufb .Lrol8(%rip), $d paddd $d, $c pxor $c, $b movdqa $b, $t @@ -129,7 +134,7 @@ sub poly_add { my ($src)=@_; -$code.="add $src, $acc0 +$code.="add 0+$src, $acc0 adc 8+$src, $acc1 adc \$1, $acc2\n"; } @@ -166,22 +171,26 @@ adc %rdx, $t3\n"; } +# At the beginning of the reduce stage t = [t3:t2:t1:t0] is a product of +# r = [r1:r0] and acc = [acc2:acc1:acc0] +# r is 124 bits at most (due to clamping) and acc is 131 bits at most +# (acc2 is at most 4 before the addition and can be at most 6 when we add in +# the next block) therefore t is at most 255 bits big, and t3 is 63 bits. sub poly_reduce_stage { $code.="mov $t0, $acc0 mov $t1, $acc1 mov $t2, $acc2 - and \$3, $acc2 + and \$3, $acc2 # At this point acc2 is 2 bits at most (value of 3) mov $t2, $t0 and \$-4, $t0 mov $t3, $t1 shrd \$2, $t3, $t2 shr \$2, $t3 - add $t0, $acc0 - adc $t1, $acc1 - adc \$0, $acc2 + add $t0, $t2 + adc $t1, $t3 # No carry out since t3 is 61 bits and t1 is 63 bits add $t2, $acc0 adc $t3, $acc1 - adc \$0, $acc2\n"; + adc \$0, $acc2\n"; # At this point acc2 has the value of 4 at most } sub poly_mul { @@ -193,7 +202,7 @@ sub prep_state { my ($n)=@_; -$code.="movdqa .chacha20_consts(%rip), $A0 +$code.="movdqa .Lchacha20_consts(%rip), $A0 movdqa $state1_store, $B0 movdqa $state2_store, $C0\n"; $code.="movdqa $A0, $A1 @@ -206,31 +215,31 @@ movdqa $B0, $B3 movdqa $C0, $C3\n" if ($n ge 4); $code.="movdqa $ctr0_store, $D0 - paddd .sse_inc(%rip), $D0 + paddd .Lsse_inc(%rip), $D0 movdqa $D0, $ctr0_store\n" if ($n eq 1); $code.="movdqa $ctr0_store, $D1 - paddd .sse_inc(%rip), $D1 + paddd .Lsse_inc(%rip), $D1 movdqa $D1, $D0 - paddd .sse_inc(%rip), $D0 + paddd .Lsse_inc(%rip), $D0 movdqa $D0, $ctr0_store movdqa $D1, $ctr1_store\n" if ($n eq 2); $code.="movdqa $ctr0_store, $D2 - paddd .sse_inc(%rip), $D2 + paddd .Lsse_inc(%rip), $D2 movdqa $D2, $D1 - paddd .sse_inc(%rip), $D1 + paddd .Lsse_inc(%rip), $D1 movdqa $D1, $D0 - paddd .sse_inc(%rip), $D0 + paddd .Lsse_inc(%rip), $D0 movdqa $D0, $ctr0_store movdqa $D1, $ctr1_store movdqa $D2, $ctr2_store\n" if ($n eq 3); $code.="movdqa $ctr0_store, $D3 - paddd .sse_inc(%rip), $D3 + paddd .Lsse_inc(%rip), $D3 movdqa $D3, $D2 - paddd .sse_inc(%rip), $D2 + paddd .Lsse_inc(%rip), $D2 movdqa $D2, $D1 - paddd .sse_inc(%rip), $D1 + paddd .Lsse_inc(%rip), $D1 movdqa $D1, $D0 - paddd .sse_inc(%rip), $D0 + paddd .Lsse_inc(%rip), $D0 movdqa $D0, $ctr0_store movdqa $D1, $ctr1_store movdqa $D2, $ctr2_store @@ -239,19 +248,19 @@ sub finalize_state { my ($n)=@_; -$code.="paddd .chacha20_consts(%rip), $A3 +$code.="paddd .Lchacha20_consts(%rip), $A3 paddd $state1_store, $B3 paddd $state2_store, $C3 paddd $ctr3_store, $D3\n" if ($n eq 4); -$code.="paddd .chacha20_consts(%rip), $A2 +$code.="paddd .Lchacha20_consts(%rip), $A2 paddd $state1_store, $B2 paddd $state2_store, $C2 paddd $ctr2_store, $D2\n" if ($n ge 3); -$code.="paddd .chacha20_consts(%rip), $A1 +$code.="paddd .Lchacha20_consts(%rip), $A1 paddd $state1_store, $B1 paddd $state2_store, $C1 paddd $ctr1_store, $D1\n" if ($n ge 2); -$code.="paddd .chacha20_consts(%rip), $A0 +$code.="paddd .Lchacha20_consts(%rip), $A0 paddd $state1_store, $B0 paddd $state2_store, $C0 paddd $ctr0_store, $D0\n"; @@ -352,10 +361,10 @@ return $round; }; -$chacha_body = &gen_chacha_round(20, ".rol16(%rip)") . - &gen_chacha_round(25, ".rol8(%rip)", "left") . - &gen_chacha_round(20, ".rol16(%rip)") . - &gen_chacha_round(25, ".rol8(%rip)", "right"); +$chacha_body = &gen_chacha_round(20, ".Lrol16(%rip)") . + &gen_chacha_round(25, ".Lrol8(%rip)", "left") . + &gen_chacha_round(20, ".Lrol16(%rip)") . + &gen_chacha_round(25, ".Lrol8(%rip)", "right"); my @loop_body = split /\n/, $chacha_body; @@ -370,16 +379,17 @@ ################################################################################ # void poly_hash_ad_internal(); $code.=" -.type poly_hash_ad_internal,\@function,2 +.type poly_hash_ad_internal,\@abi-omnipotent .align 64 poly_hash_ad_internal: .cfi_startproc +.cfi_def_cfa rsp, 8 xor $acc0, $acc0 xor $acc1, $acc1 xor $acc2, $acc2 cmp \$13, $itr2 - jne hash_ad_loop -poly_fast_tls_ad: + jne .Lhash_ad_loop +.Lpoly_fast_tls_ad: # Special treatment for the TLS case of 13 bytes mov ($adp), $acc0 mov 5($adp), $acc1 @@ -387,38 +397,38 @@ mov \$1, $acc2\n"; &poly_mul(); $code.=" ret -hash_ad_loop: +.Lhash_ad_loop: # Hash in 16 byte chunk cmp \$16, $itr2 - jb hash_ad_tail\n"; + jb .Lhash_ad_tail\n"; &poly_add("0($adp)"); &poly_mul(); $code.=" lea 1*16($adp), $adp sub \$16, $itr2 - jmp hash_ad_loop -hash_ad_tail: + jmp .Lhash_ad_loop +.Lhash_ad_tail: cmp \$0, $itr2 - je 1f + je .Lhash_ad_done # Hash last < 16 byte tail xor $t0, $t0 xor $t1, $t1 xor $t2, $t2 add $itr2, $adp -hash_ad_tail_loop: +.Lhash_ad_tail_loop: shld \$8, $t0, $t1 shl \$8, $t0 movzxb -1($adp), $t2 xor $t2, $t0 dec $adp dec $itr2 - jne hash_ad_tail_loop + jne .Lhash_ad_tail_loop add $t0, $acc0 adc $t1, $acc1 adc \$1, $acc2\n"; &poly_mul(); $code.=" # Finished AD -1: +.Lhash_ad_done: ret .cfi_endproc .size poly_hash_ad_internal, .-poly_hash_ad_internal\n"; @@ -426,86 +436,98 @@ { ################################################################################ -# void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp); +# extern void chacha20_poly1305_open(uint8_t *out_plaintext, +# const uint8_t *ciphertext, +# size_t plaintext_len, const uint8_t *ad, +# size_t ad_len, union open_data *aead_data) +# $code.=" .globl chacha20_poly1305_open -.type chacha20_poly1305_open,\@function,2 +.type chacha20_poly1305_open,\@function,6 .align 64 chacha20_poly1305_open: .cfi_startproc push %rbp -.cfi_adjust_cfa_offset 8 +.cfi_push %rbp push %rbx -.cfi_adjust_cfa_offset 8 +.cfi_push %rbx push %r12 -.cfi_adjust_cfa_offset 8 +.cfi_push %r12 push %r13 -.cfi_adjust_cfa_offset 8 +.cfi_push %r13 push %r14 -.cfi_adjust_cfa_offset 8 +.cfi_push %r14 push %r15 -.cfi_adjust_cfa_offset 8 +.cfi_push %r15 # We write the calculated authenticator back to keyp at the end, so save # the pointer on the stack too. push $keyp -.cfi_adjust_cfa_offset 8 - sub \$288 + 32, %rsp +.cfi_push $keyp + sub \$288 + $xmm_storage + 32, %rsp .cfi_adjust_cfa_offset 288 + 32 -.cfi_offset rbp, -16 -.cfi_offset rbx, -24 -.cfi_offset r12, -32 -.cfi_offset r13, -40 -.cfi_offset r14, -48 -.cfi_offset r15, -56 + lea 32(%rsp), %rbp - and \$-32, %rbp - mov %rdx, 8+$len_store - mov %r8, 0+$len_store - mov %rdx, $inl\n"; $code.=" + and \$-32, %rbp\n"; +$code.=" + movaps %xmm6,16*0+$xmm_store + movaps %xmm7,16*1+$xmm_store + movaps %xmm8,16*2+$xmm_store + movaps %xmm9,16*3+$xmm_store + movaps %xmm10,16*4+$xmm_store + movaps %xmm11,16*5+$xmm_store + movaps %xmm12,16*6+$xmm_store + movaps %xmm13,16*7+$xmm_store + movaps %xmm14,16*8+$xmm_store + movaps %xmm15,16*9+$xmm_store\n" if ($win64); +$code.=" + mov %rdx, $inl + mov $adl, 0+$len_store + mov $inl, 8+$len_store\n"; +$code.=" mov OPENSSL_ia32cap_P+8(%rip), %eax and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present xor \$`(1<<5) + (1<<8)`, %eax - jz chacha20_poly1305_open_avx2\n" if ($avx>1); + jz chacha20_poly1305_open_avx2\n" if ($avx>1); $code.=" -1: cmp \$128, $inl - jbe open_sse_128 + jbe .Lopen_sse_128 # For long buffers, prepare the poly key first - movdqa .chacha20_consts(%rip), $A0 + movdqa .Lchacha20_consts(%rip), $A0 movdqu 0*16($keyp), $B0 movdqu 1*16($keyp), $C0 movdqu 2*16($keyp), $D0 + movdqa $D0, $T1 # Store on stack, to free keyp movdqa $B0, $state1_store movdqa $C0, $state2_store movdqa $D0, $ctr0_store mov \$10, $acc0 -1: \n"; +.Lopen_sse_init_rounds:\n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" dec $acc0 - jne 1b + jne .Lopen_sse_init_rounds # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - paddd .chacha20_consts(%rip), $A0 + paddd .Lchacha20_consts(%rip), $A0 paddd $state1_store, $B0 # Clamp and store the key - pand .clamp(%rip), $A0 + pand .Lclamp(%rip), $A0 movdqa $A0, $r_store movdqa $B0, $s_store # Hash - mov %r8, $itr2 + mov $adl, $itr2 call poly_hash_ad_internal -open_sse_main_loop: +.Lopen_sse_main_loop: cmp \$16*16, $inl - jb 2f + jb .Lopen_sse_tail # Load state, increment counter blocks\n"; &prep_state(4); $code.=" # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 mov \$4, $itr1 mov $inp, $itr2 -1: \n"; +.Lopen_sse_main_loop_rounds:\n"; &emit_body(20); &poly_add("0($itr2)"); $code.=" lea 2*8($itr2), $itr2\n"; @@ -520,12 +542,12 @@ foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" dec $itr1 - jge 1b\n"; + jge .Lopen_sse_main_loop_rounds\n"; &poly_add("0($itr2)"); &poly_mul(); $code.=" lea 2*8($itr2), $itr2 cmp \$-6, $itr1 - jg 1b\n"; + jg .Lopen_sse_main_loop_rounds\n"; &finalize_state(4); &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); &xor_stream($A2, $B2, $C2, $D2, "4*16"); @@ -534,66 +556,66 @@ lea 16*16($inp), $inp lea 16*16($oup), $oup sub \$16*16, $inl - jmp open_sse_main_loop -2: + jmp .Lopen_sse_main_loop +.Lopen_sse_tail: # Handle the various tail sizes efficiently test $inl, $inl - jz open_sse_finalize + jz .Lopen_sse_finalize + cmp \$12*16, $inl + ja .Lopen_sse_tail_256 + cmp \$8*16, $inl + ja .Lopen_sse_tail_192 cmp \$4*16, $inl - ja 3f\n"; + ja .Lopen_sse_tail_128\n"; ############################################################################### # At most 64 bytes are left &prep_state(1); $code.=" xor $itr2, $itr2 mov $inl, $itr1 cmp \$16, $itr1 - jb 2f -1: \n"; - &poly_add("0($inp, $itr2)"); + jb .Lopen_sse_tail_64_rounds +.Lopen_sse_tail_64_rounds_and_x1hash: \n"; + &poly_add("0($inp,$itr2)"); &poly_mul(); $code.=" sub \$16, $itr1 -2: +.Lopen_sse_tail_64_rounds: add \$16, $itr2\n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" cmp \$16, $itr1 - jae 1b + jae .Lopen_sse_tail_64_rounds_and_x1hash cmp \$10*16, $itr2 - jne 2b\n"; + jne .Lopen_sse_tail_64_rounds\n"; &finalize_state(1); $code.=" - jmp open_sse_tail_64_dec_loop -3: - cmp \$8*16, $inl - ja 3f\n"; + jmp .Lopen_sse_tail_64_dec_loop ############################################################################### +.Lopen_sse_tail_128:\n"; # 65 - 128 bytes are left &prep_state(2); $code.=" mov $inl, $itr1 and \$-16, $itr1 xor $itr2, $itr2 -1: \n"; - &poly_add("0($inp, $itr2)"); +.Lopen_sse_tail_128_rounds_and_x1hash: \n"; + &poly_add("0($inp,$itr2)"); &poly_mul(); $code.=" -2: +.Lopen_sse_tail_128_rounds: add \$16, $itr2\n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.=" cmp $itr1, $itr2 - jb 1b + jb .Lopen_sse_tail_128_rounds_and_x1hash cmp \$10*16, $itr2 - jne 2b\n"; + jne .Lopen_sse_tail_128_rounds\n"; &finalize_state(2); &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.=" sub \$4*16, $inl lea 4*16($inp), $inp lea 4*16($oup), $oup - jmp open_sse_tail_64_dec_loop -3: - cmp \$12*16, $inl - ja 3f\n"; + jmp .Lopen_sse_tail_64_dec_loop ############################################################################### +.Lopen_sse_tail_192:\n"; # 129 - 192 bytes are left &prep_state(3); $code.=" mov $inl, $itr1 @@ -602,10 +624,10 @@ cmovg $itr2, $itr1 and \$-16, $itr1 xor $itr2, $itr2 -1: \n"; - &poly_add("0($inp, $itr2)"); +.Lopen_sse_tail_192_rounds_and_x1hash: \n"; + &poly_add("0($inp,$itr2)"); &poly_mul(); $code.=" -2: +.Lopen_sse_tail_192_rounds: add \$16, $itr2\n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); @@ -614,32 +636,32 @@ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" cmp $itr1, $itr2 - jb 1b + jb .Lopen_sse_tail_192_rounds_and_x1hash cmp \$10*16, $itr2 - jne 2b + jne .Lopen_sse_tail_192_rounds cmp \$11*16, $inl - jb 1f\n"; + jb .Lopen_sse_tail_192_finish\n"; &poly_add("10*16($inp)"); &poly_mul(); $code.=" cmp \$12*16, $inl - jb 1f\n"; + jb .Lopen_sse_tail_192_finish\n"; &poly_add("11*16($inp)"); &poly_mul(); $code.=" -1: \n"; +.Lopen_sse_tail_192_finish: \n"; &finalize_state(3); &xor_stream($A2, $B2, $C2, $D2, "0*16"); &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.=" sub \$8*16, $inl lea 8*16($inp), $inp lea 8*16($oup), $oup - jmp open_sse_tail_64_dec_loop -3: -###############################################################################\n"; + jmp .Lopen_sse_tail_64_dec_loop +############################################################################### +.Lopen_sse_tail_256:\n"; # 193 - 255 bytes are left &prep_state(4); $code.=" xor $itr2, $itr2 -1: \n"; - &poly_add("0($inp, $itr2)"); +.Lopen_sse_tail_256_rounds_and_x1hash: \n"; + &poly_add("0($inp,$itr2)"); &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left"); &chacha_qr($A1,$B1,$C1,$D1,$C3,"left"); &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load"); @@ -654,15 +676,16 @@ &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.=" add \$16, $itr2 cmp \$10*16, $itr2 - jb 1b + jb .Lopen_sse_tail_256_rounds_and_x1hash + mov $inl, $itr1 and \$-16, $itr1 -1: \n"; - &poly_add("0($inp, $itr2)"); +.Lopen_sse_tail_256_hash: \n"; + &poly_add("0($inp,$itr2)"); &poly_mul(); $code.=" add \$16, $itr2 cmp $itr1, $itr2 - jb 1b\n"; + jb .Lopen_sse_tail_256_hash\n"; &finalize_state(4); &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); &xor_stream($A2, $B2, $C2, $D2, "4*16"); @@ -673,9 +696,9 @@ lea 12*16($oup), $oup ############################################################################### # Decrypt the remaining data, 16B at a time, using existing stream -open_sse_tail_64_dec_loop: +.Lopen_sse_tail_64_dec_loop: cmp \$16, $inl - jb 1f + jb .Lopen_sse_tail_16_init sub \$16, $inl movdqu ($inp), $T0 pxor $T0, $A0 @@ -685,47 +708,46 @@ movdqa $B0, $A0 movdqa $C0, $B0 movdqa $D0, $C0 - jmp open_sse_tail_64_dec_loop -1: + jmp .Lopen_sse_tail_64_dec_loop +.Lopen_sse_tail_16_init: movdqa $A0, $A1 # Decrypt up to 16 bytes at the end. -open_sse_tail_16: +.Lopen_sse_tail_16: test $inl, $inl - jz open_sse_finalize + jz .Lopen_sse_finalize # Read the final bytes into $T0. They need to be read in reverse order so # that they end up in the correct order in $T0. pxor $T0, $T0 - lea -1($inp, $inl), $inp + lea -1($inp,$inl), $inp movq $inl, $itr2 -2: +.Lopen_sse_tail_16_compose: pslldq \$1, $T0 pinsrb \$0, ($inp), $T0 sub \$1, $inp sub \$1, $itr2 - jnz 2b + jnz .Lopen_sse_tail_16_compose -3: movq $T0, $t0 pextrq \$1, $T0, $t1 # The final bytes of keystream are in $A1. pxor $A1, $T0 # Copy the plaintext bytes out. -2: +.Lopen_sse_tail_16_extract: pextrb \$0, $T0, ($oup) psrldq \$1, $T0 add \$1, $oup sub \$1, $inl - jne 2b + jne .Lopen_sse_tail_16_extract add $t0, $acc0 adc $t1, $acc1 adc \$1, $acc2\n"; &poly_mul(); $code.=" -open_sse_finalize:\n"; +.Lopen_sse_finalize:\n"; &poly_add($len_store); &poly_mul(); $code.=" # Final reduce @@ -740,40 +762,54 @@ cmovc $t2, $acc2 # Add in s part of the key add 0+$s_store, $acc0 - adc 8+$s_store, $acc1 + adc 8+$s_store, $acc1\n"; - add \$288 + 32, %rsp +$code.=" + movaps 16*0+$xmm_store, %xmm6 + movaps 16*1+$xmm_store, %xmm7 + movaps 16*2+$xmm_store, %xmm8 + movaps 16*3+$xmm_store, %xmm9 + movaps 16*4+$xmm_store, %xmm10 + movaps 16*5+$xmm_store, %xmm11 + movaps 16*6+$xmm_store, %xmm12 + movaps 16*7+$xmm_store, %xmm13 + movaps 16*8+$xmm_store, %xmm14 + movaps 16*9+$xmm_store, %xmm15\n" if ($win64); +$code.=" +.cfi_remember_state + add \$288 + $xmm_storage + 32, %rsp .cfi_adjust_cfa_offset -(288 + 32) + # The tag replaces the key on return pop $keyp -.cfi_adjust_cfa_offset -8 - movq $acc0, ($keyp) - movq $acc1, 8($keyp) - +.cfi_pop $keyp + mov $acc0, ($keyp) + mov $acc1, 8($keyp) pop %r15 -.cfi_adjust_cfa_offset -8 +.cfi_pop %r15 pop %r14 -.cfi_adjust_cfa_offset -8 +.cfi_pop %r14 pop %r13 -.cfi_adjust_cfa_offset -8 +.cfi_pop %r13 pop %r12 -.cfi_adjust_cfa_offset -8 +.cfi_pop %r12 pop %rbx -.cfi_adjust_cfa_offset -8 +.cfi_pop %rbx pop %rbp -.cfi_adjust_cfa_offset -8 +.cfi_pop %rbp ret -.cfi_adjust_cfa_offset (8 * 6) + 288 + 32 ############################################################################### -open_sse_128: - movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 +.Lopen_sse_128: +.cfi_restore_state + movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 movdqu 2*16($keyp), $D0 - movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1 - movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2 + movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1 + movdqa $D1, $D2\npaddd .Lsse_inc(%rip), $D2 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3 mov \$10, $acc0 -1: \n"; + +.Lopen_sse_128_rounds: \n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); @@ -781,25 +817,25 @@ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" dec $acc0 - jnz 1b - paddd .chacha20_consts(%rip), $A0 - paddd .chacha20_consts(%rip), $A1 - paddd .chacha20_consts(%rip), $A2 + jnz .Lopen_sse_128_rounds + paddd .Lchacha20_consts(%rip), $A0 + paddd .Lchacha20_consts(%rip), $A1 + paddd .Lchacha20_consts(%rip), $A2 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 paddd $T2, $C1\npaddd $T2, $C2 paddd $T3, $D1 - paddd .sse_inc(%rip), $T3 + paddd .Lsse_inc(%rip), $T3 paddd $T3, $D2 # Clamp and store the key - pand .clamp(%rip), $A0 + pand .Lclamp(%rip), $A0 movdqa $A0, $r_store movdqa $B0, $s_store # Hash - mov %r8, $itr2 + mov $adl, $itr2 call poly_hash_ad_internal -1: +.Lopen_sse_128_xor_hash: cmp \$16, $inl - jb open_sse_tail_16 + jb .Lopen_sse_tail_16 sub \$16, $inl\n"; # Load for hashing &poly_add("0*8($inp)"); $code.=" @@ -818,62 +854,69 @@ movdqa $B2, $A2 movdqa $C2, $B2 movdqa $D2, $C2 - jmp 1b - jmp open_sse_tail_16 + jmp .Lopen_sse_128_xor_hash .size chacha20_poly1305_open, .-chacha20_poly1305_open .cfi_endproc ################################################################################ ################################################################################ -# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp); +# void chacha20_poly1305_seal(uint8_t *in_out, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t keyp[48]); .globl chacha20_poly1305_seal -.type chacha20_poly1305_seal,\@function,2 +.type chacha20_poly1305_seal,\@function,6 .align 64 chacha20_poly1305_seal: .cfi_startproc push %rbp -.cfi_adjust_cfa_offset 8 +.cfi_push %rbp push %rbx -.cfi_adjust_cfa_offset 8 +.cfi_push %rbx push %r12 -.cfi_adjust_cfa_offset 8 +.cfi_push %r12 push %r13 -.cfi_adjust_cfa_offset 8 +.cfi_push %r13 push %r14 -.cfi_adjust_cfa_offset 8 +.cfi_push %r14 push %r15 -.cfi_adjust_cfa_offset 8 - # We write the calculated authenticator back to keyp at the end, so save - # the pointer on the stack too. +.cfi_push %r15 +# We write the calculated authenticator back to keyp at the end, so save +# the pointer on the stack too. push $keyp -.cfi_adjust_cfa_offset 8 - sub \$288 + 32, %rsp +.cfi_push $keyp + sub \$288 + $xmm_storage + 32, %rsp .cfi_adjust_cfa_offset 288 + 32 -.cfi_offset rbp, -16 -.cfi_offset rbx, -24 -.cfi_offset r12, -32 -.cfi_offset r13, -40 -.cfi_offset r14, -48 -.cfi_offset r15, -56 lea 32(%rsp), %rbp - and \$-32, %rbp + and \$-32, %rbp\n"; +$code.=" + movaps %xmm6,16*0+$xmm_store + movaps %xmm7,16*1+$xmm_store + movaps %xmm8,16*2+$xmm_store + movaps %xmm9,16*3+$xmm_store + movaps %xmm10,16*4+$xmm_store + movaps %xmm11,16*5+$xmm_store + movaps %xmm12,16*6+$xmm_store + movaps %xmm13,16*7+$xmm_store + movaps %xmm14,16*8+$xmm_store + movaps %xmm15,16*9+$xmm_store\n" if ($win64); +$code.=" mov 56($keyp), $inl # extra_in_len addq %rdx, $inl + mov $adl, 0+$len_store mov $inl, 8+$len_store - mov %r8, 0+$len_store - mov %rdx, $inl\n"; $code.=" + mov %rdx, $inl\n"; +$code.=" mov OPENSSL_ia32cap_P+8(%rip), %eax and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present xor \$`(1<<5) + (1<<8)`, %eax - jz chacha20_poly1305_seal_avx2\n" if ($avx>1); + jz chacha20_poly1305_seal_avx2\n" if ($avx>1); $code.=" cmp \$128, $inl - jbe seal_sse_128 + jbe .Lseal_sse_128 # For longer buffers, prepare the poly key + some stream - movdqa .chacha20_consts(%rip), $A0 + movdqa .Lchacha20_consts(%rip), $A0 movdqu 0*16($keyp), $B0 movdqu 1*16($keyp), $C0 movdqu 2*16($keyp), $D0 + movdqa $A0, $A1 movdqa $A0, $A2 movdqa $A0, $A3 @@ -884,11 +927,11 @@ movdqa $C0, $C2 movdqa $C0, $C3 movdqa $D0, $D3 - paddd .sse_inc(%rip), $D0 + paddd .Lsse_inc(%rip), $D0 movdqa $D0, $D2 - paddd .sse_inc(%rip), $D0 + paddd .Lsse_inc(%rip), $D0 movdqa $D0, $D1 - paddd .sse_inc(%rip), $D0 + paddd .Lsse_inc(%rip), $D0 # Store on stack movdqa $B0, $state1_store movdqa $C0, $state2_store @@ -897,28 +940,28 @@ movdqa $D2, $ctr2_store movdqa $D3, $ctr3_store mov \$10, $acc0 -1: \n"; +.Lseal_sse_init_rounds: \n"; foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" dec $acc0 - jnz 1b\n"; + jnz .Lseal_sse_init_rounds\n"; &finalize_state(4); $code.=" # Clamp and store the key - pand .clamp(%rip), $A3 + pand .Lclamp(%rip), $A3 movdqa $A3, $r_store movdqa $B3, $s_store # Hash - mov %r8, $itr2 + mov $adl, $itr2 call poly_hash_ad_internal\n"; &xor_stream($A2,$B2,$C2,$D2,"0*16"); &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.=" cmp \$12*16, $inl - ja 1f + ja .Lseal_sse_main_init mov \$8*16, $itr1 sub \$8*16, $inl lea 8*16($inp), $inp - jmp seal_sse_128_seal_hash -1: \n"; + jmp .Lseal_sse_128_tail_hash +.Lseal_sse_main_init:\n"; &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.=" mov \$12*16, $itr1 sub \$12*16, $inl @@ -926,16 +969,17 @@ mov \$2, $itr1 mov \$8, $itr2 cmp \$4*16, $inl - jbe seal_sse_tail_64 + jbe .Lseal_sse_tail_64 cmp \$8*16, $inl - jbe seal_sse_tail_128 + jbe .Lseal_sse_tail_128 cmp \$12*16, $inl - jbe seal_sse_tail_192 + jbe .Lseal_sse_tail_192 -1: \n"; +.Lseal_sse_main_loop: \n"; # The main loop &prep_state(4); $code.=" -2: \n"; +.align 32 +.Lseal_sse_main_rounds: \n"; &emit_body(20); &poly_add("0($oup)"); &emit_body(20); @@ -950,12 +994,12 @@ @loop_body = split /\n/, $chacha_body; $code.=" lea 16($oup), $oup dec $itr2 - jge 2b\n"; + jge .Lseal_sse_main_rounds\n"; &poly_add("0*8($oup)"); &poly_mul(); $code.=" lea 16($oup), $oup dec $itr1 - jg 2b\n"; + jg .Lseal_sse_main_rounds\n"; &finalize_state(4);$code.=" movdqa $D2, $tmp_store\n"; @@ -964,56 +1008,55 @@ &xor_stream($A2,$B2,$C2,$D2, 4*16); &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.=" cmp \$16*16, $inl - ja 3f + ja .Lseal_sse_main_loop_xor mov \$12*16, $itr1 sub \$12*16, $inl lea 12*16($inp), $inp - jmp seal_sse_128_seal_hash -3: \n"; + jmp .Lseal_sse_128_tail_hash +.Lseal_sse_main_loop_xor: \n"; &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.=" lea 16*16($inp), $inp sub \$16*16, $inl mov \$6, $itr1 mov \$4, $itr2 cmp \$12*16, $inl - jg 1b + jg .Lseal_sse_main_loop mov $inl, $itr1 test $inl, $inl - je seal_sse_128_seal_hash + je .Lseal_sse_128_tail_hash mov \$6, $itr1 + cmp \$8*16, $inl + ja .Lseal_sse_tail_192 cmp \$4*16, $inl - jg 3f + ja .Lseal_sse_tail_128 ############################################################################### -seal_sse_tail_64:\n"; +.Lseal_sse_tail_64: \n"; &prep_state(1); $code.=" -1: \n"; +.Lseal_sse_tail_64_rounds_and_x2hash: \n"; &poly_add("0($oup)"); &poly_mul(); $code.=" lea 16($oup), $oup -2: \n"; +.Lseal_sse_tail_64_rounds_and_x1hash: \n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); &poly_add("0($oup)"); &poly_mul(); $code.=" lea 16($oup), $oup dec $itr1 - jg 1b + jg .Lseal_sse_tail_64_rounds_and_x2hash dec $itr2 - jge 2b\n"; + jge .Lseal_sse_tail_64_rounds_and_x1hash\n"; &finalize_state(1); $code.=" - jmp seal_sse_128_seal -3: - cmp \$8*16, $inl - jg 3f + jmp .Lseal_sse_128_tail_xor ############################################################################### -seal_sse_tail_128:\n"; +.Lseal_sse_tail_128:\n"; &prep_state(2); $code.=" -1: \n"; +.Lseal_sse_tail_128_rounds_and_x2hash: \n"; &poly_add("0($oup)"); &poly_mul(); $code.=" lea 16($oup), $oup -2: \n"; +.Lseal_sse_tail_128_rounds_and_x1hash: \n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); &poly_add("0($oup)"); @@ -1022,24 +1065,23 @@ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.=" lea 16($oup), $oup dec $itr1 - jg 1b + jg .Lseal_sse_tail_128_rounds_and_x2hash dec $itr2 - jge 2b\n"; + jge .Lseal_sse_tail_128_rounds_and_x1hash\n"; &finalize_state(2); &xor_stream($A1,$B1,$C1,$D1,0*16); $code.=" mov \$4*16, $itr1 sub \$4*16, $inl lea 4*16($inp), $inp - jmp seal_sse_128_seal_hash -3: + jmp .Lseal_sse_128_tail_hash ############################################################################### -seal_sse_tail_192:\n"; +.Lseal_sse_tail_192:\n"; &prep_state(3); $code.=" -1: \n"; +.Lseal_sse_tail_192_rounds_and_x2hash: \n"; &poly_add("0($oup)"); &poly_mul(); $code.=" lea 16($oup), $oup -2: \n"; +.Lseal_sse_tail_192_rounds_and_x1hash: \n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); @@ -1050,9 +1092,9 @@ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" lea 16($oup), $oup dec $itr1 - jg 1b + jg .Lseal_sse_tail_192_rounds_and_x2hash dec $itr2 - jge 2b\n"; + jge .Lseal_sse_tail_192_rounds_and_x1hash\n"; &finalize_state(3); &xor_stream($A2,$B2,$C2,$D2,0*16); &xor_stream($A1,$B1,$C1,$D1,4*16); $code.=" @@ -1060,18 +1102,18 @@ sub \$8*16, $inl lea 8*16($inp), $inp ############################################################################### -seal_sse_128_seal_hash: +.Lseal_sse_128_tail_hash: cmp \$16, $itr1 - jb seal_sse_128_seal\n"; + jb .Lseal_sse_128_tail_xor\n"; &poly_add("0($oup)"); &poly_mul(); $code.=" sub \$16, $itr1 lea 16($oup), $oup - jmp seal_sse_128_seal_hash + jmp .Lseal_sse_128_tail_hash -seal_sse_128_seal: +.Lseal_sse_128_tail_xor: cmp \$16, $inl - jb seal_sse_tail_16 + jb .Lseal_sse_tail_16 sub \$16, $inl # Load for decryption movdqu 0*16($inp), $T0 @@ -1092,22 +1134,22 @@ movdqa $B1, $A1 movdqa $C1, $B1 movdqa $D1, $C1 - jmp seal_sse_128_seal + jmp .Lseal_sse_128_tail_xor -seal_sse_tail_16: +.Lseal_sse_tail_16: test $inl, $inl - jz process_blocks_of_extra_in + jz .Lprocess_blocks_of_extra_in # We can only load the PT one byte at a time to avoid buffer overread mov $inl, $itr2 mov $inl, $itr1 - lea -1($inp, $inl), $inp + lea -1($inp,$inl), $inp pxor $T3, $T3 -1: +.Lseal_sse_tail_16_compose: pslldq \$1, $T3 pinsrb \$0, ($inp), $T3 lea -1($inp), $inp dec $itr1 - jne 1b + jne .Lseal_sse_tail_16_compose # XOR the keystream with the plaintext. pxor $A0, $T3 @@ -1115,12 +1157,12 @@ # Write ciphertext out, byte-by-byte. movq $inl, $itr1 movdqu $T3, $A0 -2: +.Lseal_sse_tail_16_extract: pextrb \$0, $A0, ($oup) psrldq \$1, $A0 add \$1, $oup sub \$1, $itr1 - jnz 2b + jnz .Lseal_sse_tail_16_extract # $T3 contains the final (partial, non-empty) block of ciphertext which # needs to be fed into the Poly1305 state. The right-most $inl bytes of it @@ -1129,23 +1171,23 @@ # # $keyp points to the tag output, which is actually a struct with the # extra_in pointer and length at offset 48. - movq 288+32(%rsp), $keyp + movq 288 + $xmm_storage + 32(%rsp), $keyp movq 56($keyp), $t1 # extra_in_len movq 48($keyp), $t0 # extra_in test $t1, $t1 - jz process_partial_block # Common case: no bytes of extra_in + jz .Lprocess_partial_block # Common case: no bytes of extra_in movq \$16, $t2 subq $inl, $t2 # 16-$inl is the number of bytes that fit into $T3. cmpq $t2, $t1 # if extra_in_len < 16-$inl, only copy extra_in_len # (note that AT&T syntax reverses the arguments) - jge load_extra_in + jge .Lload_extra_in movq $t1, $t2 -load_extra_in: +.Lload_extra_in: # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load # into $T3. They are loaded in reverse order. - leaq -1($t0, $t2), $inp + leaq -1($t0,$t2), $inp # Update extra_in and extra_in_len to reflect the bytes that are about to # be read. addq $t2, $t0 @@ -1159,29 +1201,29 @@ # Load $t2 bytes of extra_in into $T2. pxor $T2, $T2 -3: +.Lload_extra_load_loop: pslldq \$1, $T2 pinsrb \$0, ($inp), $T2 lea -1($inp), $inp sub \$1, $t2 - jnz 3b + jnz .Lload_extra_load_loop # Shift $T2 up the length of the remainder from the main encryption. Sadly, # the shift for an XMM register has to be a constant, thus we loop to do # this. movq $inl, $t2 -4: +.Lload_extra_shift_loop: pslldq \$1, $T2 sub \$1, $t2 - jnz 4b + jnz .Lload_extra_shift_loop # Mask $T3 (the remainder from the main encryption) so that superfluous # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are # disjoint and so we can merge them with an OR. - lea .and_masks(%rip), $t2 + lea .Land_masks(%rip), $t2 shl \$4, $inl - pand -16($t2, $inl), $T3 + pand -16($t2,$inl), $T3 # Merge $T2 into $T3, forming the remainder block. por $T2, $T3 @@ -1195,40 +1237,39 @@ adc \$1, $acc2\n"; &poly_mul(); $code.=" -process_blocks_of_extra_in: +.Lprocess_blocks_of_extra_in: # There may be additional bytes of extra_in to process. - movq 288+32(%rsp), $keyp + movq 288+32+$xmm_storage (%rsp), $keyp movq 48($keyp), $inp # extra_in movq 56($keyp), $itr2 # extra_in_len movq $itr2, $itr1 shr \$4, $itr2 # number of blocks -5: +.Lprocess_extra_hash_loop: jz process_extra_in_trailer\n"; &poly_add("0($inp)"); &poly_mul(); $code.=" leaq 16($inp), $inp subq \$1, $itr2 - jmp 5b - + jmp .Lprocess_extra_hash_loop process_extra_in_trailer: andq \$15, $itr1 # remaining num bytes (<16) of extra_in movq $itr1, $inl - jz do_length_block - leaq -1($inp, $itr1), $inp + jz .Ldo_length_block + leaq -1($inp,$itr1), $inp -6: +.Lprocess_extra_in_trailer_load: pslldq \$1, $T3 pinsrb \$0, ($inp), $T3 lea -1($inp), $inp sub \$1, $itr1 - jnz 6b + jnz .Lprocess_extra_in_trailer_load -process_partial_block: +.Lprocess_partial_block: # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0 - lea .and_masks(%rip), $t2 + lea .Land_masks(%rip), $t2 shl \$4, $inl - pand -16($t2, $inl), $T3 + pand -16($t2,$inl), $T3 movq $T3, $t0 pextrq \$1, $T3, $t1 add $t0, $acc0 @@ -1236,7 +1277,7 @@ adc \$1, $acc2\n"; &poly_mul(); $code.=" -do_length_block:\n"; +.Ldo_length_block:\n"; &poly_add($len_store); &poly_mul(); $code.=" # Final reduce @@ -1251,40 +1292,54 @@ cmovc $t2, $acc2 # Add in s part of the key add 0+$s_store, $acc0 - adc 8+$s_store, $acc1 + adc 8+$s_store, $acc1\n"; - add \$288 + 32, %rsp +$code.=" + movaps 16*0+$xmm_store, %xmm6 + movaps 16*1+$xmm_store, %xmm7 + movaps 16*2+$xmm_store, %xmm8 + movaps 16*3+$xmm_store, %xmm9 + movaps 16*4+$xmm_store, %xmm10 + movaps 16*5+$xmm_store, %xmm11 + movaps 16*6+$xmm_store, %xmm12 + movaps 16*7+$xmm_store, %xmm13 + movaps 16*8+$xmm_store, %xmm14 + movaps 16*9+$xmm_store, %xmm15\n" if ($win64); +$code.=" +.cfi_remember_state + add \$288 + $xmm_storage + 32, %rsp .cfi_adjust_cfa_offset -(288 + 32) + # The tag replaces the key on return pop $keyp -.cfi_adjust_cfa_offset -8 - mov $acc0, 0*8($keyp) - mov $acc1, 1*8($keyp) - +.cfi_pop $keyp + mov $acc0, ($keyp) + mov $acc1, 8($keyp) pop %r15 -.cfi_adjust_cfa_offset -8 +.cfi_pop %r15 pop %r14 -.cfi_adjust_cfa_offset -8 +.cfi_pop %r14 pop %r13 -.cfi_adjust_cfa_offset -8 +.cfi_pop %r13 pop %r12 -.cfi_adjust_cfa_offset -8 +.cfi_pop %r12 pop %rbx -.cfi_adjust_cfa_offset -8 +.cfi_pop %rbx pop %rbp -.cfi_adjust_cfa_offset -8 +.cfi_pop %rbp ret -.cfi_adjust_cfa_offset (8 * 7) + 288 + 32 ################################################################################ -seal_sse_128: - movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 +.Lseal_sse_128: +.cfi_restore_state + movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 movdqu 2*16($keyp), $D2 - movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0 - movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1 + movdqa $D2, $D0\npaddd .Lsse_inc(%rip), $D0 + movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3 mov \$10, $acc0 -1:\n"; + +.Lseal_sse_128_rounds:\n"; &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); @@ -1292,43 +1347,39 @@ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" dec $acc0 - jnz 1b - paddd .chacha20_consts(%rip), $A0 - paddd .chacha20_consts(%rip), $A1 - paddd .chacha20_consts(%rip), $A2 + jnz .Lseal_sse_128_rounds + paddd .Lchacha20_consts(%rip), $A0 + paddd .Lchacha20_consts(%rip), $A1 + paddd .Lchacha20_consts(%rip), $A2 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 paddd $T2, $C0\npaddd $T2, $C1 paddd $T3, $D0 - paddd .sse_inc(%rip), $T3 + paddd .Lsse_inc(%rip), $T3 paddd $T3, $D1 # Clamp and store the key - pand .clamp(%rip), $A2 + pand .Lclamp(%rip), $A2 movdqa $A2, $r_store movdqa $B2, $s_store # Hash mov %r8, $itr2 call poly_hash_ad_internal - jmp seal_sse_128_seal -.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n"; + jmp .Lseal_sse_128_tail_xor +.size chacha20_poly1305_seal, .-chacha20_poly1305_seal +.cfi_endproc\n"; } -# There should have been a cfi_endproc at the end of that function, but the two -# following blocks of code are jumped to without a stack frame and the CFI -# context which they are used in happens to match the CFI context at the end of -# the previous function. So the CFI table is just extended to the end of them. - if ($avx>1) { ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15)); my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15)); ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); -$state1_store="2*32(%rbp)"; -$state2_store="3*32(%rbp)"; -$tmp_store="4*32(%rbp)"; -$ctr0_store="5*32(%rbp)"; -$ctr1_store="6*32(%rbp)"; -$ctr2_store="7*32(%rbp)"; -$ctr3_store="8*32(%rbp)"; +$state1_store="$xmm_storage+2*32(%rbp)"; +$state2_store="$xmm_storage+3*32(%rbp)"; +$tmp_store="$xmm_storage+4*32(%rbp)"; +$ctr0_store="$xmm_storage+5*32(%rbp)"; +$ctr1_store="$xmm_storage+6*32(%rbp)"; +$ctr2_store="$xmm_storage+7*32(%rbp)"; +$ctr3_store="$xmm_storage+8*32(%rbp)"; sub chacha_qr_avx2 { my ($a,$b,$c,$d,$t,$dir)=@_; @@ -1338,7 +1389,7 @@ $code.=<<___; vpaddd $b, $a, $a vpxor $a, $d, $d - vpshufb .rol16(%rip), $d, $d + vpshufb .Lrol16(%rip), $d, $d vpaddd $d, $c, $c vpxor $c, $b, $b vpsrld \$20, $b, $t @@ -1346,7 +1397,7 @@ vpxor $t, $b, $b vpaddd $b, $a, $a vpxor $a, $d, $d - vpshufb .rol8(%rip), $d, $d + vpshufb .Lrol8(%rip), $d, $d vpaddd $d, $c, $c vpxor $c, $b, $b vpslld \$7, $b, $t @@ -1371,7 +1422,7 @@ sub prep_state_avx2 { my ($n)=@_; $code.=<<___; - vmovdqa .chacha20_consts(%rip), $A0 + vmovdqa .Lchacha20_consts(%rip), $A0 vmovdqa $state1_store, $B0 vmovdqa $state2_store, $C0 ___ @@ -1391,19 +1442,19 @@ vmovdqa $C0, $C3 ___ $code.=<<___ if ($n eq 1); - vmovdqa .avx2_inc(%rip), $D0 + vmovdqa .Lavx2_inc(%rip), $D0 vpaddd $ctr0_store, $D0, $D0 vmovdqa $D0, $ctr0_store ___ $code.=<<___ if ($n eq 2); - vmovdqa .avx2_inc(%rip), $D0 + vmovdqa .Lavx2_inc(%rip), $D0 vpaddd $ctr0_store, $D0, $D1 vpaddd $D1, $D0, $D0 vmovdqa $D0, $ctr0_store vmovdqa $D1, $ctr1_store ___ $code.=<<___ if ($n eq 3); - vmovdqa .avx2_inc(%rip), $D0 + vmovdqa .Lavx2_inc(%rip), $D0 vpaddd $ctr0_store, $D0, $D2 vpaddd $D2, $D0, $D1 vpaddd $D1, $D0, $D0 @@ -1412,7 +1463,7 @@ vmovdqa $D2, $ctr2_store ___ $code.=<<___ if ($n eq 4); - vmovdqa .avx2_inc(%rip), $D0 + vmovdqa .Lavx2_inc(%rip), $D0 vpaddd $ctr0_store, $D0, $D3 vpaddd $D3, $D0, $D2 vpaddd $D2, $D0, $D1 @@ -1427,25 +1478,25 @@ sub finalize_state_avx2 { my ($n)=@_; $code.=<<___ if ($n eq 4); - vpaddd .chacha20_consts(%rip), $A3, $A3 + vpaddd .Lchacha20_consts(%rip), $A3, $A3 vpaddd $state1_store, $B3, $B3 vpaddd $state2_store, $C3, $C3 vpaddd $ctr3_store, $D3, $D3 ___ $code.=<<___ if ($n ge 3); - vpaddd .chacha20_consts(%rip), $A2, $A2 + vpaddd .Lchacha20_consts(%rip), $A2, $A2 vpaddd $state1_store, $B2, $B2 vpaddd $state2_store, $C2, $C2 vpaddd $ctr2_store, $D2, $D2 ___ $code.=<<___ if ($n ge 2); - vpaddd .chacha20_consts(%rip), $A1, $A1 + vpaddd .Lchacha20_consts(%rip), $A1, $A1 vpaddd $state1_store, $B1, $B1 vpaddd $state2_store, $C1, $C1 vpaddd $ctr1_store, $D1, $D1 ___ $code.=<<___; - vpaddd .chacha20_consts(%rip), $A0, $A0 + vpaddd .Lchacha20_consts(%rip), $A0, $A0 vpaddd $state1_store, $B0, $B0 vpaddd $state2_store, $C0, $C0 vpaddd $ctr0_store, $D0, $D0 @@ -1536,11 +1587,10 @@ vpshufb $C0, $D2, $D2 vpshufb $C0, $D1, $D1 vpshufb $C0, $D0, $D0 - vmovdqa $tmp_store, $C0 vpaddd $D3, $C3, $C3 vpaddd $D2, $C2, $C2 vpaddd $D1, $C1, $C1 - vpaddd $D0, $C0, $C0 + vpaddd $tmp_store, $D0, $C0 vpxor $C3, $B3, $B3 vpxor $C2, $B2, $B2 vpxor $C1, $B1, $B1 @@ -1577,77 +1627,90 @@ return $round; }; -$chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") . - &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") . - &gen_chacha_round_avx2(20, ".rol16(%rip)") . - &gen_chacha_round_avx2(25, ".rol8(%rip)", "right"); +$chacha_body = &gen_chacha_round_avx2(20, ".Lrol16(%rip)") . + &gen_chacha_round_avx2(25, ".Lrol8(%rip)", "left") . + &gen_chacha_round_avx2(20, ".Lrol16(%rip)") . + &gen_chacha_round_avx2(25, ".Lrol8(%rip)", "right"); @loop_body = split /\n/, $chacha_body; $code.=" ############################################################################### -.type chacha20_poly1305_open_avx2,\@function,2 +.type chacha20_poly1305_open_avx2,\@abi-omnipotent .align 64 chacha20_poly1305_open_avx2: +.cfi_startproc + +# Since the AVX2 function operates in the frame of the SSE function, we just copy the frame state to over here +.cfi_push %rbp +.cfi_push %rbx +.cfi_push %r12 +.cfi_push %r13 +.cfi_push %r14 +.cfi_push %r15 +.cfi_push $keyp +.cfi_adjust_cfa_offset 288 + 32 + vzeroupper - vmovdqa .chacha20_consts(%rip), $A0 + vmovdqa .Lchacha20_consts(%rip), $A0 vbroadcasti128 0*16($keyp), $B0 vbroadcasti128 1*16($keyp), $C0 vbroadcasti128 2*16($keyp), $D0 - vpaddd .avx2_init(%rip), $D0, $D0 + vpaddd .Lavx2_init(%rip), $D0, $D0 cmp \$6*32, $inl - jbe open_avx2_192 + jbe .Lopen_avx2_192 cmp \$10*32, $inl - jbe open_avx2_320 + jbe .Lopen_avx2_320 vmovdqa $B0, $state1_store vmovdqa $C0, $state2_store vmovdqa $D0, $ctr0_store mov \$10, $acc0 -1: \n"; +.Lopen_avx2_init_rounds: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" dec $acc0 - jne 1b - vpaddd .chacha20_consts(%rip), $A0, $A0 + jne .Lopen_avx2_init_rounds + vpaddd .Lchacha20_consts(%rip), $A0, $A0 vpaddd $state1_store, $B0, $B0 vpaddd $state2_store, $C0, $C0 vpaddd $ctr0_store, $D0, $D0 vperm2i128 \$0x02, $A0, $B0, $T0 # Clamp and store key - vpand .clamp(%rip), $T0, $T0 + vpand .Lclamp(%rip), $T0, $T0 vmovdqa $T0, $r_store # Stream for the first 64 bytes vperm2i128 \$0x13, $A0, $B0, $A0 vperm2i128 \$0x13, $C0, $D0, $B0 # Hash AD + first 64 bytes - mov %r8, $itr2 + mov $adl, $itr2 call poly_hash_ad_internal - xor $itr1, $itr1 # Hash first 64 bytes -1: \n"; - &poly_add("0($inp, $itr1)"); + xor $itr1, $itr1 +.Lopen_avx2_init_hash: \n"; + &poly_add("0($inp,$itr1)"); &poly_mul(); $code.=" add \$16, $itr1 cmp \$2*32, $itr1 - jne 1b + jne .Lopen_avx2_init_hash # Decrypt first 64 bytes vpxor 0*32($inp), $A0, $A0 vpxor 1*32($inp), $B0, $B0 + # Store first 64 bytes of decrypted data vmovdqu $A0, 0*32($oup) vmovdqu $B0, 1*32($oup) lea 2*32($inp), $inp lea 2*32($oup), $oup sub \$2*32, $inl -1: +.Lopen_avx2_main_loop: # Hash and decrypt 512 bytes each iteration cmp \$16*32, $inl - jb 3f\n"; - &prep_state_avx2(4); $code.=" + jb .Lopen_avx2_main_loop_done\n"; + &prep_state_avx2(4); $code.=" xor $itr1, $itr1 -2: \n"; - &poly_add("0*8($inp, $itr1)"); +.Lopen_avx2_main_loop_rounds: \n"; + &poly_add("0*8($inp,$itr1)"); &emit_body(10); &poly_stage1_mulx(); &emit_body(9); @@ -1657,7 +1720,7 @@ &emit_body(10); &poly_reduce_stage(); &emit_body(9); - &poly_add("2*8($inp, $itr1)"); + &poly_add("2*8($inp,$itr1)"); &emit_body(8); &poly_stage1_mulx(); &emit_body(18); @@ -1667,7 +1730,7 @@ &emit_body(9); &poly_reduce_stage(); &emit_body(8); - &poly_add("4*8($inp, $itr1)"); $code.=" + &poly_add("4*8($inp,$itr1)"); $code.=" lea 6*8($itr1), $itr1\n"; &emit_body(18); &poly_stage1_mulx(); @@ -1680,7 +1743,7 @@ foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" cmp \$10*6*8, $itr1 - jne 2b\n"; + jne .Lopen_avx2_main_loop_rounds\n"; &finalize_state_avx2(4); $code.=" vmovdqa $A0, $tmp_store\n"; &poly_add("10*6*8($inp)"); @@ -1695,14 +1758,18 @@ lea 16*32($inp), $inp lea 16*32($oup), $oup sub \$16*32, $inl - jmp 1b -3: + jmp .Lopen_avx2_main_loop +.Lopen_avx2_main_loop_done: test $inl, $inl vzeroupper - je open_sse_finalize -3: + je .Lopen_sse_finalize + + cmp \$12*32, $inl + ja .Lopen_avx2_tail_512 + cmp \$8*32, $inl + ja .Lopen_avx2_tail_384 cmp \$4*32, $inl - ja 3f\n"; + ja .Lopen_avx2_tail_256\n"; ############################################################################### # 1-128 bytes left &prep_state_avx2(1); $code.=" @@ -1710,25 +1777,23 @@ mov $inl, $itr1 and \$-16, $itr1 test $itr1, $itr1 - je 2f -1: \n"; - &poly_add("0*8($inp, $itr2)"); + je .Lopen_avx2_tail_128_rounds # Have nothing to hash +.Lopen_avx2_tail_128_rounds_and_x1hash: \n"; + &poly_add("0*8($inp,$itr2)"); &poly_mul(); $code.=" -2: +.Lopen_avx2_tail_128_rounds: add \$16, $itr2\n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" cmp $itr1, $itr2 - jb 1b + jb .Lopen_avx2_tail_128_rounds_and_x1hash cmp \$160, $itr2 - jne 2b\n"; + jne .Lopen_avx2_tail_128_rounds\n"; &finalize_state_avx2(1); &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" - jmp open_avx2_tail_loop -3: - cmp \$8*32, $inl - ja 3f\n"; + jmp .Lopen_avx2_tail_128_xor ############################################################################### +.Lopen_avx2_tail_256: \n"; # 129-256 bytes left &prep_state_avx2(2); $code.=" mov $inl, $tmp_store @@ -1740,11 +1805,11 @@ cmovg $itr2, $itr1 mov $inp, $inl xor $itr2, $itr2 -1: \n"; +.Lopen_avx2_tail_256_rounds_and_x1hash: \n"; &poly_add("0*8($inl)"); &poly_mul_mulx(); $code.=" lea 16($inl), $inl -2: \n"; +.Lopen_avx2_tail_256_rounds: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.=" inc $itr2\n"; @@ -1752,33 +1817,31 @@ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" cmp $itr1, $itr2 - jb 1b + jb .Lopen_avx2_tail_256_rounds_and_x1hash cmp \$10, $itr2 - jne 2b + jne .Lopen_avx2_tail_256_rounds mov $inl, $itr2 sub $inp, $inl mov $inl, $itr1 mov $tmp_store, $inl -1: +.Lopen_avx2_tail_256_hash: add \$16, $itr1 cmp $inl, $itr1 - jg 1f\n"; + jg .Lopen_avx2_tail_256_done\n"; &poly_add("0*8($itr2)"); &poly_mul_mulx(); $code.=" lea 16($itr2), $itr2 - jmp 1b -1: \n"; + jmp .Lopen_avx2_tail_256_hash +.Lopen_avx2_tail_256_done: \n"; &finalize_state_avx2(2); &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0); &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" lea 4*32($inp), $inp lea 4*32($oup), $oup sub \$4*32, $inl - jmp open_avx2_tail_loop -3: - cmp \$12*32, $inl - ja 3f\n"; + jmp .Lopen_avx2_tail_128_xor ############################################################################### +.Lopen_avx2_tail_384: \n"; # 257-383 bytes left &prep_state_avx2(3); $code.=" mov $inl, $tmp_store @@ -1791,11 +1854,11 @@ cmovg $itr2, $itr1 mov $inp, $inl xor $itr2, $itr2 -1: \n"; +.Lopen_avx2_tail_384_rounds_and_x2hash: \n"; &poly_add("0*8($inl)"); &poly_mul_mulx(); $code.=" lea 16($inl), $inl -2: \n"; +.Lopen_avx2_tail_384_rounds_and_x1hash: \n"; &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); @@ -1807,22 +1870,22 @@ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" cmp $itr1, $itr2 - jb 1b + jb .Lopen_avx2_tail_384_rounds_and_x2hash cmp \$10, $itr2 - jne 2b + jne .Lopen_avx2_tail_384_rounds_and_x1hash mov $inl, $itr2 sub $inp, $inl mov $inl, $itr1 mov $tmp_store, $inl -1: +.Lopen_avx2_384_tail_hash: add \$16, $itr1 cmp $inl, $itr1 - jg 1f\n"; + jg .Lopen_avx2_384_tail_done\n"; &poly_add("0*8($itr2)"); &poly_mul_mulx(); $code.=" lea 16($itr2), $itr2 - jmp 1b -1: \n"; + jmp .Lopen_avx2_384_tail_hash +.Lopen_avx2_384_tail_done: \n"; &finalize_state_avx2(3); &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0); &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0); @@ -1830,18 +1893,18 @@ lea 8*32($inp), $inp lea 8*32($oup), $oup sub \$8*32, $inl - jmp open_avx2_tail_loop -3: \n"; + jmp .Lopen_avx2_tail_128_xor ############################################################################### +.Lopen_avx2_tail_512: \n"; # 384-512 bytes left &prep_state_avx2(4); $code.=" xor $itr1, $itr1 mov $inp, $itr2 -1: \n"; +.Lopen_avx2_tail_512_rounds_and_x2hash: \n"; &poly_add("0*8($itr2)"); &poly_mul(); $code.=" lea 2*8($itr2), $itr2 -2: \n"; +.Lopen_avx2_tail_512_rounds_and_x1hash: \n"; &emit_body(37); &poly_add("0*8($itr2)"); &poly_mul_mulx(); @@ -1853,21 +1916,21 @@ @loop_body = split /\n/, $chacha_body; $code.=" inc $itr1 cmp \$4, $itr1 - jl 1b + jl .Lopen_avx2_tail_512_rounds_and_x2hash cmp \$10, $itr1 - jne 2b + jne .Lopen_avx2_tail_512_rounds_and_x1hash mov $inl, $itr1 sub \$12*32, $itr1 and \$-16, $itr1 -1: +.Lopen_avx2_tail_512_hash: test $itr1, $itr1 - je 1f\n"; + je .Lopen_avx2_tail_512_done\n"; &poly_add("0*8($itr2)"); &poly_mul_mulx(); $code.=" lea 2*8($itr2), $itr2 sub \$2*8, $itr1 - jmp 1b -1: \n"; + jmp .Lopen_avx2_tail_512_hash +.Lopen_avx2_tail_512_done: \n"; &finalize_state_avx2(4); $code.=" vmovdqa $A0, $tmp_store\n"; &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" @@ -1878,9 +1941,9 @@ lea 12*32($inp), $inp lea 12*32($oup), $oup sub \$12*32, $inl -open_avx2_tail_loop: +.Lopen_avx2_tail_128_xor: cmp \$32, $inl - jb open_avx2_tail + jb .Lopen_avx2_tail_32_xor sub \$32, $inl vpxor ($inp), $A0, $A0 vmovdqu $A0, ($oup) @@ -1889,11 +1952,11 @@ vmovdqa $B0, $A0 vmovdqa $C0, $B0 vmovdqa $D0, $C0 - jmp open_avx2_tail_loop -open_avx2_tail: + jmp .Lopen_avx2_tail_128_xor +.Lopen_avx2_tail_32_xor: cmp \$16, $inl vmovdqa $A0x, $A1x - jb 1f + jb .Lopen_avx2_exit sub \$16, $inl #load for decryption vpxor ($inp), $A0x, $A1x @@ -1902,28 +1965,28 @@ lea 1*16($oup), $oup vperm2i128 \$0x11, $A0, $A0, $A0 vmovdqa $A0x, $A1x -1: +.Lopen_avx2_exit: vzeroupper - jmp open_sse_tail_16 + jmp .Lopen_sse_tail_16 ############################################################################### -open_avx2_192: +.Lopen_avx2_192: vmovdqa $A0, $A1 vmovdqa $A0, $A2 vmovdqa $B0, $B1 vmovdqa $B0, $B2 vmovdqa $C0, $C1 vmovdqa $C0, $C2 - vpaddd .avx2_inc(%rip), $D0, $D1 + vpaddd .Lavx2_inc(%rip), $D0, $D1 vmovdqa $D0, $T2 vmovdqa $D1, $T3 mov \$10, $acc0 -1: \n"; +.Lopen_avx2_192_rounds: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" dec $acc0 - jne 1b + jne .Lopen_avx2_192_rounds vpaddd $A2, $A0, $A0 vpaddd $A2, $A1, $A1 vpaddd $B2, $B0, $B0 @@ -1934,7 +1997,7 @@ vpaddd $T3, $D1, $D1 vperm2i128 \$0x02, $A0, $B0, $T0 # Clamp and store the key - vpand .clamp(%rip), $T0, $T0 + vpand .Lclamp(%rip), $T0, $T0 vmovdqa $T0, $r_store # Stream for up to 192 bytes vperm2i128 \$0x13, $A0, $B0, $A0 @@ -1943,12 +2006,12 @@ vperm2i128 \$0x02, $C1, $D1, $D0 vperm2i128 \$0x13, $A1, $B1, $A1 vperm2i128 \$0x13, $C1, $D1, $B1 -open_avx2_short: - mov %r8, $itr2 +.Lopen_avx2_short: + mov $adl, $itr2 call poly_hash_ad_internal -open_avx2_hash_and_xor_loop: +.Lopen_avx2_short_hash_and_xor_loop: cmp \$32, $inl - jb open_avx2_short_tail_32 + jb .Lopen_avx2_short_tail_32 sub \$32, $inl\n"; # Load + hash &poly_add("0*8($inp)"); @@ -1970,11 +2033,11 @@ vmovdqa $D1, $C1 vmovdqa $A2, $D1 vmovdqa $B2, $A2 - jmp open_avx2_hash_and_xor_loop -open_avx2_short_tail_32: + jmp .Lopen_avx2_short_hash_and_xor_loop +.Lopen_avx2_short_tail_32: cmp \$16, $inl vmovdqa $A0x, $A1x - jb 1f + jb .Lopen_avx2_short_tail_32_exit sub \$16, $inl\n"; &poly_add("0*8($inp)"); &poly_mul(); $code.=" @@ -1983,26 +2046,26 @@ lea 1*16($inp), $inp lea 1*16($oup), $oup vextracti128 \$1, $A0, $A1x -1: +.Lopen_avx2_short_tail_32_exit: vzeroupper - jmp open_sse_tail_16 + jmp .Lopen_sse_tail_16 ############################################################################### -open_avx2_320: +.Lopen_avx2_320: vmovdqa $A0, $A1 vmovdqa $A0, $A2 vmovdqa $B0, $B1 vmovdqa $B0, $B2 vmovdqa $C0, $C1 vmovdqa $C0, $C2 - vpaddd .avx2_inc(%rip), $D0, $D1 - vpaddd .avx2_inc(%rip), $D1, $D2 + vpaddd .Lavx2_inc(%rip), $D0, $D1 + vpaddd .Lavx2_inc(%rip), $D1, $D2 vmovdqa $B0, $T1 vmovdqa $C0, $T2 vmovdqa $D0, $ctr0_store vmovdqa $D1, $ctr1_store vmovdqa $D2, $ctr2_store mov \$10, $acc0 -1: \n"; +.Lopen_avx2_320_rounds: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); @@ -2010,10 +2073,10 @@ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" dec $acc0 - jne 1b - vpaddd .chacha20_consts(%rip), $A0, $A0 - vpaddd .chacha20_consts(%rip), $A1, $A1 - vpaddd .chacha20_consts(%rip), $A2, $A2 + jne .Lopen_avx2_320_rounds + vpaddd .Lchacha20_consts(%rip), $A0, $A0 + vpaddd .Lchacha20_consts(%rip), $A1, $A1 + vpaddd .Lchacha20_consts(%rip), $A2, $A2 vpaddd $T1, $B0, $B0 vpaddd $T1, $B1, $B1 vpaddd $T1, $B2, $B2 @@ -2025,7 +2088,7 @@ vpaddd $ctr2_store, $D2, $D2 vperm2i128 \$0x02, $A0, $B0, $T0 # Clamp and store the key - vpand .clamp(%rip), $T0, $T0 + vpand .Lclamp(%rip), $T0, $T0 vmovdqa $T0, $r_store # Stream for up to 320 bytes vperm2i128 \$0x13, $A0, $B0, $A0 @@ -2038,23 +2101,36 @@ vperm2i128 \$0x02, $C2, $D2, $D1 vperm2i128 \$0x13, $A2, $B2, $A2 vperm2i128 \$0x13, $C2, $D2, $B2 - jmp open_avx2_short + jmp .Lopen_avx2_short .size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 +.cfi_endproc ############################################################################### ############################################################################### -.type chacha20_poly1305_seal_avx2,\@function,2 +.type chacha20_poly1305_seal_avx2,\@abi-omnipotent .align 64 chacha20_poly1305_seal_avx2: +.cfi_startproc + +# Since the AVX2 function operates in the frame of the SSE function, we just copy the frame state to over here +.cfi_push %rbp +.cfi_push %rbx +.cfi_push %r12 +.cfi_push %r13 +.cfi_push %r14 +.cfi_push %r15 +.cfi_push $keyp +.cfi_adjust_cfa_offset 288 + 32 + vzeroupper - vmovdqa .chacha20_consts(%rip), $A0 + vmovdqa .Lchacha20_consts(%rip), $A0 vbroadcasti128 0*16($keyp), $B0 vbroadcasti128 1*16($keyp), $C0 vbroadcasti128 2*16($keyp), $D0 - vpaddd .avx2_init(%rip), $D0, $D0 + vpaddd .Lavx2_init(%rip), $D0, $D0 cmp \$6*32, $inl - jbe seal_avx2_192 + jbe .Lseal_avx2_192 cmp \$10*32, $inl - jbe seal_avx2_320 + jbe .Lseal_avx2_320 vmovdqa $A0, $A1 vmovdqa $A0, $A2 vmovdqa $A0, $A3 @@ -2067,26 +2143,26 @@ vmovdqa $C0, $C3 vmovdqa $C0, $state2_store vmovdqa $D0, $D3 - vpaddd .avx2_inc(%rip), $D3, $D2 - vpaddd .avx2_inc(%rip), $D2, $D1 - vpaddd .avx2_inc(%rip), $D1, $D0 + vpaddd .Lavx2_inc(%rip), $D3, $D2 + vpaddd .Lavx2_inc(%rip), $D2, $D1 + vpaddd .Lavx2_inc(%rip), $D1, $D0 vmovdqa $D0, $ctr0_store vmovdqa $D1, $ctr1_store vmovdqa $D2, $ctr2_store vmovdqa $D3, $ctr3_store mov \$10, $acc0 -1: \n"; +.Lseal_avx2_init_rounds: \n"; foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" dec $acc0 - jnz 1b\n"; + jnz .Lseal_avx2_init_rounds\n"; &finalize_state_avx2(4); $code.=" vperm2i128 \$0x13, $C3, $D3, $C3 vperm2i128 \$0x02, $A3, $B3, $D3 vperm2i128 \$0x13, $A3, $B3, $A3 - vpand .clamp(%rip), $D3, $D3 + vpand .Lclamp(%rip), $D3, $D3 vmovdqa $D3, $r_store - mov %r8, $itr2 + mov $adl, $itr2 call poly_hash_ad_internal # Safely store 320 bytes (otherwise would handle with optimized call) vpxor 0*32($inp), $A3, $A3 @@ -2100,7 +2176,7 @@ sub \$10*32, $inl mov \$10*32, $itr1 cmp \$4*32, $inl - jbe seal_avx2_hash + jbe .Lseal_avx2_short_hash_remainder vpxor 0*32($inp), $A0, $A0 vpxor 1*32($inp), $B0, $B0 vpxor 2*32($inp), $C0, $C0 @@ -2114,13 +2190,13 @@ mov \$8, $itr1 mov \$2, $itr2 cmp \$4*32, $inl - jbe seal_avx2_tail_128 + jbe .Lseal_avx2_tail_128 cmp \$8*32, $inl - jbe seal_avx2_tail_256 + jbe .Lseal_avx2_tail_256 cmp \$12*32, $inl - jbe seal_avx2_tail_384 + jbe .Lseal_avx2_tail_384 cmp \$16*32, $inl - jbe seal_avx2_tail_512\n"; + jbe .Lseal_avx2_tail_512\n"; # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop &prep_state_avx2(4); foreach $l (@loop_body) {$code.=$l."\n";} @@ -2129,11 +2205,13 @@ @loop_body = split /\n/, $chacha_body; $code.=" sub \$16, $oup mov \$9, $itr1 - jmp 4f -1: \n"; + jmp .Lseal_avx2_main_loop_rounds_entry +.align 32 +.Lseal_avx2_main_loop: \n"; &prep_state_avx2(4); $code.=" mov \$10, $itr1 -2: \n"; +.align 32 +.Lseal_avx2_main_loop_rounds: \n"; &poly_add("0*8($oup)"); &emit_body(10); &poly_stage1_mulx(); @@ -2143,7 +2221,7 @@ &poly_stage3_mulx(); &emit_body(10); &poly_reduce_stage(); $code.=" -4: \n"; +.Lseal_avx2_main_loop_rounds_entry: \n"; &emit_body(9); &poly_add("2*8($oup)"); &emit_body(8); @@ -2168,65 +2246,68 @@ foreach $l (@loop_body) {$code.=$l."\n";} @loop_body = split /\n/, $chacha_body; $code.=" dec $itr1 - jne 2b\n"; + jne .Lseal_avx2_main_loop_rounds\n"; &finalize_state_avx2(4); $code.=" - lea 4*8($oup), $oup vmovdqa $A0, $tmp_store\n"; - &poly_add("-4*8($oup)"); + &poly_add("0*8($oup)"); + &poly_mul_mulx(); + &poly_add("2*8($oup)"); + &poly_mul_mulx(); $code.=" + lea 4*8($oup), $oup\n"; &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" vmovdqa $tmp_store, $A0\n"; - &poly_mul(); &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); - &poly_add("-2*8($oup)"); &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); - &poly_mul(); &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" lea 16*32($inp), $inp sub \$16*32, $inl cmp \$16*32, $inl - jg 1b\n"; + jg .Lseal_avx2_main_loop +\n"; &poly_add("0*8($oup)"); - &poly_mul(); + &poly_mul_mulx(); &poly_add("2*8($oup)"); - &poly_mul(); $code.=" + &poly_mul_mulx(); $code.=" lea 4*8($oup), $oup mov \$10, $itr1 xor $itr2, $itr2 + + cmp \$12*32, $inl + ja .Lseal_avx2_tail_512 + cmp \$8*32, $inl + ja .Lseal_avx2_tail_384 cmp \$4*32, $inl - ja 3f + ja .Lseal_avx2_tail_256 ############################################################################### -seal_avx2_tail_128:\n"; +.Lseal_avx2_tail_128:\n"; &prep_state_avx2(1); $code.=" -1: \n"; +.Lseal_avx2_tail_128_rounds_and_3xhash: \n"; &poly_add("0($oup)"); - &poly_mul(); $code.=" + &poly_mul_mulx(); $code.=" lea 2*8($oup), $oup -2: \n"; +.Lseal_avx2_tail_128_rounds_and_2xhash: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &poly_add("0*8($oup)"); - &poly_mul(); + &poly_mul_mulx(); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); &poly_add("2*8($oup)"); - &poly_mul(); $code.=" + &poly_mul_mulx(); $code.=" lea 4*8($oup), $oup dec $itr1 - jg 1b + jg .Lseal_avx2_tail_128_rounds_and_3xhash dec $itr2 - jge 2b\n"; + jge .Lseal_avx2_tail_128_rounds_and_2xhash\n"; &finalize_state_avx2(1); &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" - jmp seal_avx2_short_loop -3: - cmp \$8*32, $inl - ja 3f + jmp .Lseal_avx2_short_loop ############################################################################### -seal_avx2_tail_256:\n"; +.Lseal_avx2_tail_256:\n"; &prep_state_avx2(2); $code.=" -1: \n"; +.Lseal_avx2_tail_256_rounds_and_3xhash: \n"; &poly_add("0($oup)"); &poly_mul(); $code.=" lea 2*8($oup), $oup -2: \n"; +.Lseal_avx2_tail_256_rounds_and_2xhash: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &poly_add("0*8($oup)"); @@ -2237,27 +2318,24 @@ &poly_mul(); $code.=" lea 4*8($oup), $oup dec $itr1 - jg 1b + jg .Lseal_avx2_tail_256_rounds_and_3xhash dec $itr2 - jge 2b\n"; + jge .Lseal_avx2_tail_256_rounds_and_2xhash\n"; &finalize_state_avx2(2); &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0); &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" mov \$4*32, $itr1 lea 4*32($inp), $inp sub \$4*32, $inl - jmp seal_avx2_hash -3: - cmp \$12*32, $inl - ja seal_avx2_tail_512 + jmp .Lseal_avx2_short_hash_remainder ############################################################################### -seal_avx2_tail_384:\n"; +.Lseal_avx2_tail_384:\n"; &prep_state_avx2(3); $code.=" -1: \n"; +.Lseal_avx2_tail_384_rounds_and_3xhash: \n"; &poly_add("0($oup)"); &poly_mul(); $code.=" lea 2*8($oup), $oup -2: \n"; +.Lseal_avx2_tail_384_rounds_and_2xhash: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &poly_add("0*8($oup)"); @@ -2270,9 +2348,9 @@ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" lea 4*8($oup), $oup dec $itr1 - jg 1b + jg .Lseal_avx2_tail_384_rounds_and_3xhash dec $itr2 - jge 2b\n"; + jge .Lseal_avx2_tail_384_rounds_and_2xhash\n"; &finalize_state_avx2(3); &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0); &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0); @@ -2280,15 +2358,15 @@ mov \$8*32, $itr1 lea 8*32($inp), $inp sub \$8*32, $inl - jmp seal_avx2_hash + jmp .Lseal_avx2_short_hash_remainder ############################################################################### -seal_avx2_tail_512:\n"; +.Lseal_avx2_tail_512:\n"; &prep_state_avx2(4); $code.=" -1: \n"; +.Lseal_avx2_tail_512_rounds_and_3xhash: \n"; &poly_add("0($oup)"); &poly_mul_mulx(); $code.=" lea 2*8($oup), $oup -2: \n"; +.Lseal_avx2_tail_512_rounds_and_2xhash: \n"; &emit_body(20); &poly_add("0*8($oup)"); &emit_body(20); @@ -2313,9 +2391,9 @@ @loop_body = split /\n/, $chacha_body; $code.=" lea 4*8($oup), $oup dec $itr1 - jg 1b + jg .Lseal_avx2_tail_512_rounds_and_3xhash dec $itr2 - jge 2b\n"; + jge .Lseal_avx2_tail_512_rounds_and_2xhash\n"; &finalize_state_avx2(4); $code.=" vmovdqa $A0, $tmp_store\n"; &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" @@ -2326,24 +2404,24 @@ mov \$12*32, $itr1 lea 12*32($inp), $inp sub \$12*32, $inl - jmp seal_avx2_hash + jmp .Lseal_avx2_short_hash_remainder ################################################################################ -seal_avx2_320: +.Lseal_avx2_320: vmovdqa $A0, $A1 vmovdqa $A0, $A2 vmovdqa $B0, $B1 vmovdqa $B0, $B2 vmovdqa $C0, $C1 vmovdqa $C0, $C2 - vpaddd .avx2_inc(%rip), $D0, $D1 - vpaddd .avx2_inc(%rip), $D1, $D2 + vpaddd .Lavx2_inc(%rip), $D0, $D1 + vpaddd .Lavx2_inc(%rip), $D1, $D2 vmovdqa $B0, $T1 vmovdqa $C0, $T2 vmovdqa $D0, $ctr0_store vmovdqa $D1, $ctr1_store vmovdqa $D2, $ctr2_store mov \$10, $acc0 -1: \n"; +.Lseal_avx2_320_rounds: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); @@ -2351,10 +2429,10 @@ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" dec $acc0 - jne 1b - vpaddd .chacha20_consts(%rip), $A0, $A0 - vpaddd .chacha20_consts(%rip), $A1, $A1 - vpaddd .chacha20_consts(%rip), $A2, $A2 + jne .Lseal_avx2_320_rounds + vpaddd .Lchacha20_consts(%rip), $A0, $A0 + vpaddd .Lchacha20_consts(%rip), $A1, $A1 + vpaddd .Lchacha20_consts(%rip), $A2, $A2 vpaddd $T1, $B0, $B0 vpaddd $T1, $B1, $B1 vpaddd $T1, $B2, $B2 @@ -2366,7 +2444,7 @@ vpaddd $ctr2_store, $D2, $D2 vperm2i128 \$0x02, $A0, $B0, $T0 # Clamp and store the key - vpand .clamp(%rip), $T0, $T0 + vpand .Lclamp(%rip), $T0, $T0 vmovdqa $T0, $r_store # Stream for up to 320 bytes vperm2i128 \$0x13, $A0, $B0, $A0 @@ -2379,26 +2457,26 @@ vperm2i128 \$0x02, $C2, $D2, $D1 vperm2i128 \$0x13, $A2, $B2, $A2 vperm2i128 \$0x13, $C2, $D2, $B2 - jmp seal_avx2_short + jmp .Lseal_avx2_short ################################################################################ -seal_avx2_192: +.Lseal_avx2_192: vmovdqa $A0, $A1 vmovdqa $A0, $A2 vmovdqa $B0, $B1 vmovdqa $B0, $B2 vmovdqa $C0, $C1 vmovdqa $C0, $C2 - vpaddd .avx2_inc(%rip), $D0, $D1 + vpaddd .Lavx2_inc(%rip), $D0, $D1 vmovdqa $D0, $T2 vmovdqa $D1, $T3 mov \$10, $acc0 -1: \n"; +.Lseal_avx2_192_rounds: \n"; &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" dec $acc0 - jne 1b + jne .Lseal_avx2_192_rounds vpaddd $A2, $A0, $A0 vpaddd $A2, $A1, $A1 vpaddd $B2, $B0, $B0 @@ -2409,7 +2487,7 @@ vpaddd $T3, $D1, $D1 vperm2i128 \$0x02, $A0, $B0, $T0 # Clamp and store the key - vpand .clamp(%rip), $T0, $T0 + vpand .Lclamp(%rip), $T0, $T0 vmovdqa $T0, $r_store # Stream for up to 192 bytes vperm2i128 \$0x13, $A0, $B0, $A0 @@ -2418,21 +2496,21 @@ vperm2i128 \$0x02, $C1, $D1, $D0 vperm2i128 \$0x13, $A1, $B1, $A1 vperm2i128 \$0x13, $C1, $D1, $B1 -seal_avx2_short: - mov %r8, $itr2 +.Lseal_avx2_short: + mov $adl, $itr2 call poly_hash_ad_internal xor $itr1, $itr1 -seal_avx2_hash: +.Lseal_avx2_short_hash_remainder: cmp \$16, $itr1 - jb seal_avx2_short_loop\n"; + jb .Lseal_avx2_short_loop\n"; &poly_add("0($oup)"); &poly_mul(); $code.=" sub \$16, $itr1 add \$16, $oup - jmp seal_avx2_hash -seal_avx2_short_loop: + jmp .Lseal_avx2_short_hash_remainder +.Lseal_avx2_short_loop: cmp \$32, $inl - jb seal_avx2_short_tail + jb .Lseal_avx2_short_tail sub \$32, $inl # Encrypt vpxor ($inp), $A0, $A0 @@ -2454,10 +2532,10 @@ vmovdqa $D1, $C1 vmovdqa $A2, $D1 vmovdqa $B2, $A2 - jmp seal_avx2_short_loop -seal_avx2_short_tail: + jmp .Lseal_avx2_short_loop +.Lseal_avx2_short_tail: cmp \$16, $inl - jb 1f + jb .Lseal_avx2_exit sub \$16, $inl vpxor ($inp), $A0x, $A3x vmovdqu $A3x, ($oup) @@ -2466,24 +2544,16 @@ &poly_mul(); $code.=" lea 1*16($oup), $oup vextracti128 \$1, $A0, $A0x -1: +.Lseal_avx2_exit: vzeroupper - jmp seal_sse_tail_16 + jmp .Lseal_sse_tail_16 .cfi_endproc +.size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2 "; } -if (!$win64) { - $code =~ s/\`([^\`]*)\`/eval $1/gem; - print $code; -} else { - print <<___; -.text -.globl dummy_chacha20_poly1305_asm -.type dummy_chacha20_poly1305_asm,\@abi-omnipotent -dummy_chacha20_poly1305_asm: - ret -___ -} +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +print $code; close STDOUT or die "error closing STDOUT";
diff --git a/crypto/cipher_extra/e_chacha20poly1305.c b/crypto/cipher_extra/e_chacha20poly1305.c index 1c175e9..1650188 100644 --- a/crypto/cipher_extra/e_chacha20poly1305.c +++ b/crypto/cipher_extra/e_chacha20poly1305.c
@@ -18,18 +18,15 @@ #include <openssl/chacha.h> #include <openssl/cipher.h> -#include <openssl/cpu.h> #include <openssl/err.h> #include <openssl/mem.h> #include <openssl/poly1305.h> #include <openssl/type_check.h> +#include "internal.h" +#include "../chacha/internal.h" #include "../fipsmodule/cipher/internal.h" #include "../internal.h" -#include "../chacha/internal.h" - - -#define POLY1305_TAG_LEN 16 struct aead_chacha20_poly1305_ctx { uint8_t key[32]; @@ -44,78 +41,6 @@ "AEAD state has insufficient alignment"); #endif -// For convenience (the x86_64 calling convention allows only six parameters in -// registers), the final parameter for the assembly functions is both an input -// and output parameter. -union open_data { - struct { - alignas(16) uint8_t key[32]; - uint32_t counter; - uint8_t nonce[12]; - } in; - struct { - uint8_t tag[POLY1305_TAG_LEN]; - } out; -}; - -union seal_data { - struct { - alignas(16) uint8_t key[32]; - uint32_t counter; - uint8_t nonce[12]; - const uint8_t *extra_ciphertext; - size_t extra_ciphertext_len; - } in; - struct { - uint8_t tag[POLY1305_TAG_LEN]; - } out; -}; - -#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \ - !defined(OPENSSL_WINDOWS) -static int asm_capable(void) { - const int sse41_capable = (OPENSSL_ia32cap_P[1] & (1 << 19)) != 0; - return sse41_capable; -} - -OPENSSL_STATIC_ASSERT(sizeof(union open_data) == 48, "wrong open_data size"); -OPENSSL_STATIC_ASSERT(sizeof(union seal_data) == 48 + 8 + 8, - "wrong seal_data size"); - -// chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It decrypts -// |plaintext_len| bytes from |ciphertext| and writes them to |out_plaintext|. -// Additional input parameters are passed in |aead_data->in|. On exit, it will -// write calculated tag value to |aead_data->out.tag|, which the caller must -// check. -extern void chacha20_poly1305_open(uint8_t *out_plaintext, - const uint8_t *ciphertext, - size_t plaintext_len, const uint8_t *ad, - size_t ad_len, union open_data *aead_data); - -// chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It encrypts -// |plaintext_len| bytes from |plaintext| and writes them to |out_ciphertext|. -// Additional input parameters are passed in |aead_data->in|. The calculated tag -// value is over the computed ciphertext concatenated with |extra_ciphertext| -// and written to |aead_data->out.tag|. -extern void chacha20_poly1305_seal(uint8_t *out_ciphertext, - const uint8_t *plaintext, - size_t plaintext_len, const uint8_t *ad, - size_t ad_len, union seal_data *aead_data); -#else -static int asm_capable(void) { return 0; } - - -static void chacha20_poly1305_open(uint8_t *out_plaintext, - const uint8_t *ciphertext, - size_t plaintext_len, const uint8_t *ad, - size_t ad_len, union open_data *aead_data) {} - -static void chacha20_poly1305_seal(uint8_t *out_ciphertext, - const uint8_t *plaintext, - size_t plaintext_len, const uint8_t *ad, - size_t ad_len, union seal_data *aead_data) {} -#endif - static int aead_chacha20_poly1305_init(EVP_AEAD_CTX *ctx, const uint8_t *key, size_t key_len, size_t tag_len) { struct aead_chacha20_poly1305_ctx *c20_ctx = @@ -238,8 +163,8 @@ } } - union seal_data data; - if (asm_capable()) { + union chacha20_poly1305_seal_data data; + if (chacha20_poly1305_asm_capable()) { OPENSSL_memcpy(data.in.key, key, 32); data.in.counter = 0; OPENSSL_memcpy(data.in.nonce, nonce, 12); @@ -321,8 +246,8 @@ return 0; } - union open_data data; - if (asm_capable()) { + union chacha20_poly1305_open_data data; + if (chacha20_poly1305_asm_capable()) { OPENSSL_memcpy(data.in.key, key, 32); data.in.counter = 0; OPENSSL_memcpy(data.in.nonce, nonce, 12);
diff --git a/crypto/cipher_extra/internal.h b/crypto/cipher_extra/internal.h index 1d2c4e1..c2af48e 100644 --- a/crypto/cipher_extra/internal.h +++ b/crypto/cipher_extra/internal.h
@@ -57,7 +57,11 @@ #ifndef OPENSSL_HEADER_CIPHER_EXTRA_INTERNAL_H #define OPENSSL_HEADER_CIPHER_EXTRA_INTERNAL_H +#include <stdlib.h> + #include <openssl/base.h> +#include <openssl/cpu.h> +#include <openssl/type_check.h> #include "../internal.h" @@ -120,6 +124,89 @@ const uint8_t *mac_secret, unsigned mac_secret_length); +#define POLY1305_TAG_LEN 16 + +// For convenience (the x86_64 calling convention allows only six parameters in +// registers), the final parameter for the assembly functions is both an input +// and output parameter. +union chacha20_poly1305_open_data { + struct { + alignas(16) uint8_t key[32]; + uint32_t counter; + uint8_t nonce[12]; + } in; + struct { + uint8_t tag[POLY1305_TAG_LEN]; + } out; +}; + +union chacha20_poly1305_seal_data { + struct { + alignas(16) uint8_t key[32]; + uint32_t counter; + uint8_t nonce[12]; + const uint8_t *extra_ciphertext; + size_t extra_ciphertext_len; + } in; + struct { + uint8_t tag[POLY1305_TAG_LEN]; + } out; +}; + +#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) + +OPENSSL_STATIC_ASSERT(sizeof(union chacha20_poly1305_open_data) == 48, + "wrong chacha20_poly1305_open_data size"); +OPENSSL_STATIC_ASSERT(sizeof(union chacha20_poly1305_seal_data) == 48 + 8 + 8, + "wrong chacha20_poly1305_seal_data size"); + +OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) { + const int sse41_capable = (OPENSSL_ia32cap_P[1] & (1 << 19)) != 0; + return sse41_capable; +} + +// chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It decrypts +// |plaintext_len| bytes from |ciphertext| and writes them to |out_plaintext|. +// Additional input parameters are passed in |aead_data->in|. On exit, it will +// write calculated tag value to |aead_data->out.tag|, which the caller must +// check. +extern void chacha20_poly1305_open(uint8_t *out_plaintext, + const uint8_t *ciphertext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_open_data *data); + +// chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It encrypts +// |plaintext_len| bytes from |plaintext| and writes them to |out_ciphertext|. +// Additional input parameters are passed in |aead_data->in|. The calculated tag +// value is over the computed ciphertext concatenated with |extra_ciphertext| +// and written to |aead_data->out.tag|. +extern void chacha20_poly1305_seal(uint8_t *out_ciphertext, + const uint8_t *plaintext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_seal_data *data); +#else + +OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) { return 0; } + +OPENSSL_INLINE void chacha20_poly1305_open(uint8_t *out_plaintext, + const uint8_t *ciphertext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_open_data *data) { + abort(); +} + +OPENSSL_INLINE void chacha20_poly1305_seal(uint8_t *out_ciphertext, + const uint8_t *plaintext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_seal_data *data) { + abort(); +} +#endif + #if defined(__cplusplus) } // extern C