blob: fb117607797974b959c369b4279118151a13fbd0 [file] [log] [blame]
#!/usr/bin/env perl
# Copyright (c) 2015, CloudFlare Ltd.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
##############################################################################
# #
# Author: Vlad Krasnov #
# #
##############################################################################
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$avx = 2;
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P
chacha20_poly1305_constants:
.section .rodata
.align 64
.Lchacha20_consts:
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.Lrol8:
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.Lrol16:
.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
.Lavx2_init:
.long 0,0,0,0
.Lsse_inc:
.long 1,0,0,0
.Lavx2_inc:
.long 2,0,0,0,2,0,0,0
.Lclamp:
.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
.align 16
.Land_masks:
.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.text
___
my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2,$adl)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8","%r8");
my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
my $xmm_storage = 0;
if ($win64) {
$xmm_storage = 10*16;
}
my $xmm_store="0*16(%rbp)";
my $r_store="$xmm_storage+0*16(%rbp)";
my $s_store="$xmm_storage+1*16(%rbp)";
my $len_store="$xmm_storage+2*16(%rbp)";
my $state1_store="$xmm_storage+3*16(%rbp)";
my $state2_store="$xmm_storage+4*16(%rbp)";
my $tmp_store="$xmm_storage+5*16(%rbp)";
my $ctr0_store="$xmm_storage+6*16(%rbp)";
my $ctr1_store="$xmm_storage+7*16(%rbp)";
my $ctr2_store="$xmm_storage+8*16(%rbp)";
my $ctr3_store="$xmm_storage+9*16(%rbp)";
sub chacha_qr {
my ($a,$b,$c,$d,$t,$dir)=@_;
$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
$code.="paddd $b, $a
pxor $a, $d
pshufb .Lrol16(%rip), $d
paddd $d, $c
pxor $c, $b
movdqa $b, $t
pslld \$12, $t
psrld \$20, $b
pxor $t, $b
paddd $b, $a
pxor $a, $d
pshufb .Lrol8(%rip), $d
paddd $d, $c
pxor $c, $b
movdqa $b, $t
pslld \$7, $t
psrld \$25, $b
pxor $t, $b\n";
$code.="palignr \$4, $b, $b
palignr \$8, $c, $c
palignr \$12, $d, $d\n" if ($dir =~ /left/);
$code.="palignr \$12, $b, $b
palignr \$8, $c, $c
palignr \$4, $d, $d\n" if ($dir =~ /right/);
$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/);
}
sub poly_add {
my ($src)=@_;
$code.="add 0+$src, $acc0
adc 8+$src, $acc1
adc \$1, $acc2\n";
}
sub poly_stage1 {
$code.="mov 0+$r_store, %rax
mov %rax, $t2
mul $acc0
mov %rax, $t0
mov %rdx, $t1
mov 0+$r_store, %rax
mul $acc1
imulq $acc2, $t2
add %rax, $t1
adc %rdx, $t2\n";
}
sub poly_stage2 {
$code.="mov 8+$r_store, %rax
mov %rax, $t3
mul $acc0
add %rax, $t1
adc \$0, %rdx
mov %rdx, $acc0
mov 8+$r_store, %rax
mul $acc1
add %rax, $t2
adc \$0, %rdx\n";
}
sub poly_stage3 {
$code.="imulq $acc2, $t3
add $acc0, $t2
adc %rdx, $t3\n";
}
# At the beginning of the reduce stage t = [t3:t2:t1:t0] is a product of
# r = [r1:r0] and acc = [acc2:acc1:acc0]
# r is 124 bits at most (due to clamping) and acc is 131 bits at most
# (acc2 is at most 4 before the addition and can be at most 6 when we add in
# the next block) therefore t is at most 255 bits big, and t3 is 63 bits.
sub poly_reduce_stage {
$code.="mov $t0, $acc0
mov $t1, $acc1
mov $t2, $acc2
and \$3, $acc2 # At this point acc2 is 2 bits at most (value of 3)
mov $t2, $t0
and \$-4, $t0
mov $t3, $t1
shrd \$2, $t3, $t2
shr \$2, $t3
add $t0, $t2
adc $t1, $t3 # No carry out since t3 is 61 bits and t1 is 63 bits
add $t2, $acc0
adc $t3, $acc1
adc \$0, $acc2\n"; # At this point acc2 has the value of 4 at most
}
sub poly_mul {
&poly_stage1();
&poly_stage2();
&poly_stage3();
&poly_reduce_stage();
}
sub prep_state {
my ($n)=@_;
$code.="movdqa .Lchacha20_consts(%rip), $A0
movdqa $state1_store, $B0
movdqa $state2_store, $C0\n";
$code.="movdqa $A0, $A1
movdqa $B0, $B1
movdqa $C0, $C1\n" if ($n ge 2);
$code.="movdqa $A0, $A2
movdqa $B0, $B2
movdqa $C0, $C2\n" if ($n ge 3);
$code.="movdqa $A0, $A3
movdqa $B0, $B3
movdqa $C0, $C3\n" if ($n ge 4);
$code.="movdqa $ctr0_store, $D0
paddd .Lsse_inc(%rip), $D0
movdqa $D0, $ctr0_store\n" if ($n eq 1);
$code.="movdqa $ctr0_store, $D1
paddd .Lsse_inc(%rip), $D1
movdqa $D1, $D0
paddd .Lsse_inc(%rip), $D0
movdqa $D0, $ctr0_store
movdqa $D1, $ctr1_store\n" if ($n eq 2);
$code.="movdqa $ctr0_store, $D2
paddd .Lsse_inc(%rip), $D2
movdqa $D2, $D1
paddd .Lsse_inc(%rip), $D1
movdqa $D1, $D0
paddd .Lsse_inc(%rip), $D0
movdqa $D0, $ctr0_store
movdqa $D1, $ctr1_store
movdqa $D2, $ctr2_store\n" if ($n eq 3);
$code.="movdqa $ctr0_store, $D3
paddd .Lsse_inc(%rip), $D3
movdqa $D3, $D2
paddd .Lsse_inc(%rip), $D2
movdqa $D2, $D1
paddd .Lsse_inc(%rip), $D1
movdqa $D1, $D0
paddd .Lsse_inc(%rip), $D0
movdqa $D0, $ctr0_store
movdqa $D1, $ctr1_store
movdqa $D2, $ctr2_store
movdqa $D3, $ctr3_store\n" if ($n eq 4);
}
sub finalize_state {
my ($n)=@_;
$code.="paddd .Lchacha20_consts(%rip), $A3
paddd $state1_store, $B3
paddd $state2_store, $C3
paddd $ctr3_store, $D3\n" if ($n eq 4);
$code.="paddd .Lchacha20_consts(%rip), $A2
paddd $state1_store, $B2
paddd $state2_store, $C2
paddd $ctr2_store, $D2\n" if ($n ge 3);
$code.="paddd .Lchacha20_consts(%rip), $A1
paddd $state1_store, $B1
paddd $state2_store, $C1
paddd $ctr1_store, $D1\n" if ($n ge 2);
$code.="paddd .Lchacha20_consts(%rip), $A0
paddd $state1_store, $B0
paddd $state2_store, $C0
paddd $ctr0_store, $D0\n";
}
sub xor_stream {
my ($A, $B, $C, $D, $offset)=@_;
$code.="movdqu 0*16 + $offset($inp), $A3
movdqu 1*16 + $offset($inp), $B3
movdqu 2*16 + $offset($inp), $C3
movdqu 3*16 + $offset($inp), $D3
pxor $A3, $A
pxor $B3, $B
pxor $C3, $C
pxor $D, $D3
movdqu $A, 0*16 + $offset($oup)
movdqu $B, 1*16 + $offset($oup)
movdqu $C, 2*16 + $offset($oup)
movdqu $D3, 3*16 + $offset($oup)\n";
}
sub xor_stream_using_temp {
my ($A, $B, $C, $D, $offset, $temp)=@_;
$code.="movdqa $temp, $tmp_store
movdqu 0*16 + $offset($inp), $temp
pxor $A, $temp
movdqu $temp, 0*16 + $offset($oup)
movdqu 1*16 + $offset($inp), $temp
pxor $B, $temp
movdqu $temp, 1*16 + $offset($oup)
movdqu 2*16 + $offset($inp), $temp
pxor $C, $temp
movdqu $temp, 2*16 + $offset($oup)
movdqu 3*16 + $offset($inp), $temp
pxor $D, $temp
movdqu $temp, 3*16 + $offset($oup)\n";
}
sub gen_chacha_round {
my ($rot1, $rot2, $shift)=@_;
my $round="";
$round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20);
$round.="movdqa $rot2, $C0
paddd $B3, $A3
paddd $B2, $A2
paddd $B1, $A1
paddd $B0, $A0
pxor $A3, $D3
pxor $A2, $D2
pxor $A1, $D1
pxor $A0, $D0
pshufb $C0, $D3
pshufb $C0, $D2
pshufb $C0, $D1
pshufb $C0, $D0
movdqa $tmp_store, $C0
paddd $D3, $C3
paddd $D2, $C2
paddd $D1, $C1
paddd $D0, $C0
pxor $C3, $B3
pxor $C2, $B2
pxor $C1, $B1
pxor $C0, $B0
movdqa $C0, $tmp_store
movdqa $B3, $C0
psrld \$$rot1, $C0
pslld \$32-$rot1, $B3
pxor $C0, $B3
movdqa $B2, $C0
psrld \$$rot1, $C0
pslld \$32-$rot1, $B2
pxor $C0, $B2
movdqa $B1, $C0
psrld \$$rot1, $C0
pslld \$32-$rot1, $B1
pxor $C0, $B1
movdqa $B0, $C0
psrld \$$rot1, $C0
pslld \$32-$rot1, $B0
pxor $C0, $B0\n";
($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
$round.="movdqa $tmp_store, $C0
palignr \$$s1, $B3, $B3
palignr \$$s2, $C3, $C3
palignr \$$s3, $D3, $D3
palignr \$$s1, $B2, $B2
palignr \$$s2, $C2, $C2
palignr \$$s3, $D2, $D2
palignr \$$s1, $B1, $B1
palignr \$$s2, $C1, $C1
palignr \$$s3, $D1, $D1
palignr \$$s1, $B0, $B0
palignr \$$s2, $C0, $C0
palignr \$$s3, $D0, $D0\n"
if (($shift =~ /left/) || ($shift =~ /right/));
return $round;
};
$chacha_body = &gen_chacha_round(20, ".Lrol16(%rip)") .
&gen_chacha_round(25, ".Lrol8(%rip)", "left") .
&gen_chacha_round(20, ".Lrol16(%rip)") .
&gen_chacha_round(25, ".Lrol8(%rip)", "right");
my @loop_body = split /\n/, $chacha_body;
sub emit_body {
my ($n)=@_;
for (my $i=0; $i < $n; $i++) {
$code=$code.shift(@loop_body)."\n";
};
}
{
################################################################################
# void poly_hash_ad_internal();
$code.="
.type poly_hash_ad_internal,\@abi-omnipotent
.align 64
poly_hash_ad_internal:
.cfi_startproc
.cfi_def_cfa rsp, 8
xor $acc0, $acc0
xor $acc1, $acc1
xor $acc2, $acc2
cmp \$13, $itr2
jne .Lhash_ad_loop
.Lpoly_fast_tls_ad:
# Special treatment for the TLS case of 13 bytes
mov ($adp), $acc0
mov 5($adp), $acc1
shr \$24, $acc1
mov \$1, $acc2\n";
&poly_mul(); $code.="
ret
.Lhash_ad_loop:
# Hash in 16 byte chunk
cmp \$16, $itr2
jb .Lhash_ad_tail\n";
&poly_add("0($adp)");
&poly_mul(); $code.="
lea 1*16($adp), $adp
sub \$16, $itr2
jmp .Lhash_ad_loop
.Lhash_ad_tail:
cmp \$0, $itr2
je .Lhash_ad_done
# Hash last < 16 byte tail
xor $t0, $t0
xor $t1, $t1
xor $t2, $t2
add $itr2, $adp
.Lhash_ad_tail_loop:
shld \$8, $t0, $t1
shl \$8, $t0
movzxb -1($adp), $t2
xor $t2, $t0
dec $adp
dec $itr2
jne .Lhash_ad_tail_loop
add $t0, $acc0
adc $t1, $acc1
adc \$1, $acc2\n";
&poly_mul(); $code.="
# Finished AD
.Lhash_ad_done:
ret
.cfi_endproc
.size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
}
{
################################################################################
# void chacha20_poly1305_open(uint8_t *out_plaintext, const uint8_t *ciphertext,
# size_t plaintext_len, const uint8_t *ad,
# size_t ad_len,
# union chacha20_poly1305_open_data *aead_data)
#
$code.="
.globl chacha20_poly1305_open
.type chacha20_poly1305_open,\@function,6
.align 64
chacha20_poly1305_open:
.cfi_startproc
_CET_ENDBR
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
# We write the calculated authenticator back to keyp at the end, so save
# the pointer on the stack too.
push $keyp
.cfi_push $keyp
sub \$288 + $xmm_storage + 32, %rsp
.cfi_adjust_cfa_offset 288 + 32
lea 32(%rsp), %rbp
and \$-32, %rbp\n";
$code.="
movaps %xmm6,16*0+$xmm_store
movaps %xmm7,16*1+$xmm_store
movaps %xmm8,16*2+$xmm_store
movaps %xmm9,16*3+$xmm_store
movaps %xmm10,16*4+$xmm_store
movaps %xmm11,16*5+$xmm_store
movaps %xmm12,16*6+$xmm_store
movaps %xmm13,16*7+$xmm_store
movaps %xmm14,16*8+$xmm_store
movaps %xmm15,16*9+$xmm_store\n" if ($win64);
$code.="
mov %rdx, $inl
mov $adl, 0+$len_store
mov $inl, 8+$len_store\n";
$code.="
mov OPENSSL_ia32cap_P+8(%rip), %eax
and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
xor \$`(1<<5) + (1<<8)`, %eax
jz chacha20_poly1305_open_avx2\n" if ($avx>1);
$code.="
cmp \$128, $inl
jbe .Lopen_sse_128
# For long buffers, prepare the poly key first
movdqa .Lchacha20_consts(%rip), $A0
movdqu 0*16($keyp), $B0
movdqu 1*16($keyp), $C0
movdqu 2*16($keyp), $D0
movdqa $D0, $T1
# Store on stack, to free keyp
movdqa $B0, $state1_store
movdqa $C0, $state2_store
movdqa $D0, $ctr0_store
mov \$10, $acc0
.Lopen_sse_init_rounds:\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
dec $acc0
jne .Lopen_sse_init_rounds
# A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
paddd .Lchacha20_consts(%rip), $A0
paddd $state1_store, $B0
# Clamp and store the key
pand .Lclamp(%rip), $A0
movdqa $A0, $r_store
movdqa $B0, $s_store
# Hash
mov $adl, $itr2
call poly_hash_ad_internal
.Lopen_sse_main_loop:
cmp \$16*16, $inl
jb .Lopen_sse_tail
# Load state, increment counter blocks\n";
&prep_state(4); $code.="
# There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
# hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
mov \$4, $itr1
mov $inp, $itr2
.Lopen_sse_main_loop_rounds:\n";
&emit_body(20);
&poly_add("0($itr2)"); $code.="
lea 2*8($itr2), $itr2\n";
&emit_body(20);
&poly_stage1();
&emit_body(20);
&poly_stage2();
&emit_body(20);
&poly_stage3();
&emit_body(20);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
dec $itr1
jge .Lopen_sse_main_loop_rounds\n";
&poly_add("0($itr2)");
&poly_mul(); $code.="
lea 2*8($itr2), $itr2
cmp \$-6, $itr1
jg .Lopen_sse_main_loop_rounds\n";
&finalize_state(4);
&xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
&xor_stream($A2, $B2, $C2, $D2, "4*16");
&xor_stream($A1, $B1, $C1, $D1, "8*16");
&xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
lea 16*16($inp), $inp
lea 16*16($oup), $oup
sub \$16*16, $inl
jmp .Lopen_sse_main_loop
.Lopen_sse_tail:
# Handle the various tail sizes efficiently
test $inl, $inl
jz .Lopen_sse_finalize
cmp \$12*16, $inl
ja .Lopen_sse_tail_256
cmp \$8*16, $inl
ja .Lopen_sse_tail_192
cmp \$4*16, $inl
ja .Lopen_sse_tail_128\n";
###############################################################################
# At most 64 bytes are left
&prep_state(1); $code.="
xor $itr2, $itr2
mov $inl, $itr1
cmp \$16, $itr1
jb .Lopen_sse_tail_64_rounds
.Lopen_sse_tail_64_rounds_and_x1hash: \n";
&poly_add("0($inp,$itr2)");
&poly_mul(); $code.="
sub \$16, $itr1
.Lopen_sse_tail_64_rounds:
add \$16, $itr2\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
cmp \$16, $itr1
jae .Lopen_sse_tail_64_rounds_and_x1hash
cmp \$10*16, $itr2
jne .Lopen_sse_tail_64_rounds\n";
&finalize_state(1); $code.="
jmp .Lopen_sse_tail_64_dec_loop
###############################################################################
.Lopen_sse_tail_128:\n";
# 65 - 128 bytes are left
&prep_state(2); $code.="
mov $inl, $itr1
and \$-16, $itr1
xor $itr2, $itr2
.Lopen_sse_tail_128_rounds_and_x1hash: \n";
&poly_add("0($inp,$itr2)");
&poly_mul(); $code.="
.Lopen_sse_tail_128_rounds:
add \$16, $itr2\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
cmp $itr1, $itr2
jb .Lopen_sse_tail_128_rounds_and_x1hash
cmp \$10*16, $itr2
jne .Lopen_sse_tail_128_rounds\n";
&finalize_state(2);
&xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
sub \$4*16, $inl
lea 4*16($inp), $inp
lea 4*16($oup), $oup
jmp .Lopen_sse_tail_64_dec_loop
###############################################################################
.Lopen_sse_tail_192:\n";
# 129 - 192 bytes are left
&prep_state(3); $code.="
mov $inl, $itr1
mov \$10*16, $itr2
cmp \$10*16, $itr1
cmovg $itr2, $itr1
and \$-16, $itr1
xor $itr2, $itr2
.Lopen_sse_tail_192_rounds_and_x1hash: \n";
&poly_add("0($inp,$itr2)");
&poly_mul(); $code.="
.Lopen_sse_tail_192_rounds:
add \$16, $itr2\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
cmp $itr1, $itr2
jb .Lopen_sse_tail_192_rounds_and_x1hash
cmp \$10*16, $itr2
jne .Lopen_sse_tail_192_rounds
cmp \$11*16, $inl
jb .Lopen_sse_tail_192_finish\n";
&poly_add("10*16($inp)");
&poly_mul(); $code.="
cmp \$12*16, $inl
jb .Lopen_sse_tail_192_finish\n";
&poly_add("11*16($inp)");
&poly_mul(); $code.="
.Lopen_sse_tail_192_finish: \n";
&finalize_state(3);
&xor_stream($A2, $B2, $C2, $D2, "0*16");
&xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
sub \$8*16, $inl
lea 8*16($inp), $inp
lea 8*16($oup), $oup
jmp .Lopen_sse_tail_64_dec_loop
###############################################################################
.Lopen_sse_tail_256:\n";
# 193 - 255 bytes are left
&prep_state(4); $code.="
xor $itr2, $itr2
.Lopen_sse_tail_256_rounds_and_x1hash: \n";
&poly_add("0($inp,$itr2)");
&chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
&chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
&chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
&poly_stage1();
&chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
&poly_stage2();
&chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
&chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
&poly_stage3();
&chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
&poly_reduce_stage();
&chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
add \$16, $itr2
cmp \$10*16, $itr2
jb .Lopen_sse_tail_256_rounds_and_x1hash
mov $inl, $itr1
and \$-16, $itr1
.Lopen_sse_tail_256_hash: \n";
&poly_add("0($inp,$itr2)");
&poly_mul(); $code.="
add \$16, $itr2
cmp $itr1, $itr2
jb .Lopen_sse_tail_256_hash\n";
&finalize_state(4);
&xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
&xor_stream($A2, $B2, $C2, $D2, "4*16");
&xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
movdqa $tmp_store, $D0
sub \$12*16, $inl
lea 12*16($inp), $inp
lea 12*16($oup), $oup
###############################################################################
# Decrypt the remaining data, 16B at a time, using existing stream
.Lopen_sse_tail_64_dec_loop:
cmp \$16, $inl
jb .Lopen_sse_tail_16_init
sub \$16, $inl
movdqu ($inp), $T0
pxor $T0, $A0
movdqu $A0, ($oup)
lea 16($inp), $inp
lea 16($oup), $oup
movdqa $B0, $A0
movdqa $C0, $B0
movdqa $D0, $C0
jmp .Lopen_sse_tail_64_dec_loop
.Lopen_sse_tail_16_init:
movdqa $A0, $A1
# Decrypt up to 16 bytes at the end.
.Lopen_sse_tail_16:
test $inl, $inl
jz .Lopen_sse_finalize
# Read the final bytes into $T0. They need to be read in reverse order so
# that they end up in the correct order in $T0.
pxor $T0, $T0
lea -1($inp,$inl), $inp
movq $inl, $itr2
.Lopen_sse_tail_16_compose:
pslldq \$1, $T0
pinsrb \$0, ($inp), $T0
sub \$1, $inp
sub \$1, $itr2
jnz .Lopen_sse_tail_16_compose
movq $T0, $t0
pextrq \$1, $T0, $t1
# The final bytes of keystream are in $A1.
pxor $A1, $T0
# Copy the plaintext bytes out.
.Lopen_sse_tail_16_extract:
pextrb \$0, $T0, ($oup)
psrldq \$1, $T0
add \$1, $oup
sub \$1, $inl
jne .Lopen_sse_tail_16_extract
add $t0, $acc0
adc $t1, $acc1
adc \$1, $acc2\n";
&poly_mul(); $code.="
.Lopen_sse_finalize:\n";
&poly_add($len_store);
&poly_mul(); $code.="
# Final reduce
mov $acc0, $t0
mov $acc1, $t1
mov $acc2, $t2
sub \$-5, $acc0
sbb \$-1, $acc1
sbb \$3, $acc2
cmovc $t0, $acc0
cmovc $t1, $acc1
cmovc $t2, $acc2
# Add in s part of the key
add 0+$s_store, $acc0
adc 8+$s_store, $acc1\n";
$code.="
movaps 16*0+$xmm_store, %xmm6
movaps 16*1+$xmm_store, %xmm7
movaps 16*2+$xmm_store, %xmm8
movaps 16*3+$xmm_store, %xmm9
movaps 16*4+$xmm_store, %xmm10
movaps 16*5+$xmm_store, %xmm11
movaps 16*6+$xmm_store, %xmm12
movaps 16*7+$xmm_store, %xmm13
movaps 16*8+$xmm_store, %xmm14
movaps 16*9+$xmm_store, %xmm15\n" if ($win64);
$code.="
.cfi_remember_state
add \$288 + $xmm_storage + 32, %rsp
.cfi_adjust_cfa_offset -(288 + 32)
# The tag replaces the key on return
pop $keyp
.cfi_pop $keyp
mov $acc0, ($keyp)
mov $acc1, 8($keyp)
pop %r15
.cfi_pop %r15
pop %r14
.cfi_pop %r14
pop %r13
.cfi_pop %r13
pop %r12
.cfi_pop %r12
pop %rbx
.cfi_pop %rbx
pop %rbp
.cfi_pop %rbp
ret
###############################################################################
.Lopen_sse_128:
.cfi_restore_state
movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
movdqu 2*16($keyp), $D0
movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1
movdqa $D1, $D2\npaddd .Lsse_inc(%rip), $D2
movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
mov \$10, $acc0
.Lopen_sse_128_rounds: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jnz .Lopen_sse_128_rounds
paddd .Lchacha20_consts(%rip), $A0
paddd .Lchacha20_consts(%rip), $A1
paddd .Lchacha20_consts(%rip), $A2
paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
paddd $T2, $C1\npaddd $T2, $C2
paddd $T3, $D1
paddd .Lsse_inc(%rip), $T3
paddd $T3, $D2
# Clamp and store the key
pand .Lclamp(%rip), $A0
movdqa $A0, $r_store
movdqa $B0, $s_store
# Hash
mov $adl, $itr2
call poly_hash_ad_internal
.Lopen_sse_128_xor_hash:
cmp \$16, $inl
jb .Lopen_sse_tail_16
sub \$16, $inl\n";
# Load for hashing
&poly_add("0*8($inp)"); $code.="
# Load for decryption
movdqu 0*16($inp), $T0
pxor $T0, $A1
movdqu $A1, 0*16($oup)
lea 1*16($inp), $inp
lea 1*16($oup), $oup\n";
&poly_mul(); $code.="
# Shift the stream left
movdqa $B1, $A1
movdqa $C1, $B1
movdqa $D1, $C1
movdqa $A2, $D1
movdqa $B2, $A2
movdqa $C2, $B2
movdqa $D2, $C2
jmp .Lopen_sse_128_xor_hash
.size chacha20_poly1305_open, .-chacha20_poly1305_open
.cfi_endproc
################################################################################
################################################################################
# void chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext,
# size_t plaintext_len, const uint8_t *ad,
# size_t ad_len,
# union chacha20_poly1305_seal_data *data);
.globl chacha20_poly1305_seal
.type chacha20_poly1305_seal,\@function,6
.align 64
chacha20_poly1305_seal:
.cfi_startproc
_CET_ENDBR
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
# We write the calculated authenticator back to keyp at the end, so save
# the pointer on the stack too.
push $keyp
.cfi_push $keyp
sub \$288 + $xmm_storage + 32, %rsp
.cfi_adjust_cfa_offset 288 + 32
lea 32(%rsp), %rbp
and \$-32, %rbp\n";
$code.="
movaps %xmm6,16*0+$xmm_store
movaps %xmm7,16*1+$xmm_store
movaps %xmm8,16*2+$xmm_store
movaps %xmm9,16*3+$xmm_store
movaps %xmm10,16*4+$xmm_store
movaps %xmm11,16*5+$xmm_store
movaps %xmm12,16*6+$xmm_store
movaps %xmm13,16*7+$xmm_store
movaps %xmm14,16*8+$xmm_store
movaps %xmm15,16*9+$xmm_store\n" if ($win64);
$code.="
mov 56($keyp), $inl # extra_in_len
addq %rdx, $inl
mov $adl, 0+$len_store
mov $inl, 8+$len_store
mov %rdx, $inl\n";
$code.="
mov OPENSSL_ia32cap_P+8(%rip), %eax
and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
xor \$`(1<<5) + (1<<8)`, %eax
jz chacha20_poly1305_seal_avx2\n" if ($avx>1);
$code.="
cmp \$128, $inl
jbe .Lseal_sse_128
# For longer buffers, prepare the poly key + some stream
movdqa .Lchacha20_consts(%rip), $A0
movdqu 0*16($keyp), $B0
movdqu 1*16($keyp), $C0
movdqu 2*16($keyp), $D0
movdqa $A0, $A1
movdqa $A0, $A2
movdqa $A0, $A3
movdqa $B0, $B1
movdqa $B0, $B2
movdqa $B0, $B3
movdqa $C0, $C1
movdqa $C0, $C2
movdqa $C0, $C3
movdqa $D0, $D3
paddd .Lsse_inc(%rip), $D0
movdqa $D0, $D2
paddd .Lsse_inc(%rip), $D0
movdqa $D0, $D1
paddd .Lsse_inc(%rip), $D0
# Store on stack
movdqa $B0, $state1_store
movdqa $C0, $state2_store
movdqa $D0, $ctr0_store
movdqa $D1, $ctr1_store
movdqa $D2, $ctr2_store
movdqa $D3, $ctr3_store
mov \$10, $acc0
.Lseal_sse_init_rounds: \n";
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
dec $acc0
jnz .Lseal_sse_init_rounds\n";
&finalize_state(4); $code.="
# Clamp and store the key
pand .Lclamp(%rip), $A3
movdqa $A3, $r_store
movdqa $B3, $s_store
# Hash
mov $adl, $itr2
call poly_hash_ad_internal\n";
&xor_stream($A2,$B2,$C2,$D2,"0*16");
&xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
cmp \$12*16, $inl
ja .Lseal_sse_main_init
mov \$8*16, $itr1
sub \$8*16, $inl
lea 8*16($inp), $inp
jmp .Lseal_sse_128_tail_hash
.Lseal_sse_main_init:\n";
&xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
mov \$12*16, $itr1
sub \$12*16, $inl
lea 12*16($inp), $inp
mov \$2, $itr1
mov \$8, $itr2
cmp \$4*16, $inl
jbe .Lseal_sse_tail_64
cmp \$8*16, $inl
jbe .Lseal_sse_tail_128
cmp \$12*16, $inl
jbe .Lseal_sse_tail_192
.Lseal_sse_main_loop: \n";
# The main loop
&prep_state(4); $code.="
.align 32
.Lseal_sse_main_rounds: \n";
&emit_body(20);
&poly_add("0($oup)");
&emit_body(20);
&poly_stage1();
&emit_body(20);
&poly_stage2();
&emit_body(20);
&poly_stage3();
&emit_body(20);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
lea 16($oup), $oup
dec $itr2
jge .Lseal_sse_main_rounds\n";
&poly_add("0*8($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
dec $itr1
jg .Lseal_sse_main_rounds\n";
&finalize_state(4);$code.="
movdqa $D2, $tmp_store\n";
&xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
movdqa $tmp_store, $D2\n";
&xor_stream($A2,$B2,$C2,$D2, 4*16);
&xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
cmp \$16*16, $inl
ja .Lseal_sse_main_loop_xor
mov \$12*16, $itr1
sub \$12*16, $inl
lea 12*16($inp), $inp
jmp .Lseal_sse_128_tail_hash
.Lseal_sse_main_loop_xor: \n";
&xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
lea 16*16($inp), $inp
sub \$16*16, $inl
mov \$6, $itr1
mov \$4, $itr2
cmp \$12*16, $inl
jg .Lseal_sse_main_loop
mov $inl, $itr1
test $inl, $inl
je .Lseal_sse_128_tail_hash
mov \$6, $itr1
cmp \$8*16, $inl
ja .Lseal_sse_tail_192
cmp \$4*16, $inl
ja .Lseal_sse_tail_128
###############################################################################
.Lseal_sse_tail_64: \n";
&prep_state(1); $code.="
.Lseal_sse_tail_64_rounds_and_x2hash: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
.Lseal_sse_tail_64_rounds_and_x1hash: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
dec $itr1
jg .Lseal_sse_tail_64_rounds_and_x2hash
dec $itr2
jge .Lseal_sse_tail_64_rounds_and_x1hash\n";
&finalize_state(1); $code.="
jmp .Lseal_sse_128_tail_xor
###############################################################################
.Lseal_sse_tail_128:\n";
&prep_state(2); $code.="
.Lseal_sse_tail_128_rounds_and_x2hash: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
.Lseal_sse_tail_128_rounds_and_x1hash: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&poly_add("0($oup)");
&poly_mul();
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
lea 16($oup), $oup
dec $itr1
jg .Lseal_sse_tail_128_rounds_and_x2hash
dec $itr2
jge .Lseal_sse_tail_128_rounds_and_x1hash\n";
&finalize_state(2);
&xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
mov \$4*16, $itr1
sub \$4*16, $inl
lea 4*16($inp), $inp
jmp .Lseal_sse_128_tail_hash
###############################################################################
.Lseal_sse_tail_192:\n";
&prep_state(3); $code.="
.Lseal_sse_tail_192_rounds_and_x2hash: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
.Lseal_sse_tail_192_rounds_and_x1hash: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
&poly_add("0($oup)");
&poly_mul();
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
lea 16($oup), $oup
dec $itr1
jg .Lseal_sse_tail_192_rounds_and_x2hash
dec $itr2
jge .Lseal_sse_tail_192_rounds_and_x1hash\n";
&finalize_state(3);
&xor_stream($A2,$B2,$C2,$D2,0*16);
&xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
mov \$8*16, $itr1
sub \$8*16, $inl
lea 8*16($inp), $inp
###############################################################################
.Lseal_sse_128_tail_hash:
cmp \$16, $itr1
jb .Lseal_sse_128_tail_xor\n";
&poly_add("0($oup)");
&poly_mul(); $code.="
sub \$16, $itr1
lea 16($oup), $oup
jmp .Lseal_sse_128_tail_hash
.Lseal_sse_128_tail_xor:
cmp \$16, $inl
jb .Lseal_sse_tail_16
sub \$16, $inl
# Load for decryption
movdqu 0*16($inp), $T0
pxor $T0, $A0
movdqu $A0, 0*16($oup)
# Then hash
add 0*8($oup), $acc0
adc 1*8($oup), $acc1
adc \$1, $acc2
lea 1*16($inp), $inp
lea 1*16($oup), $oup\n";
&poly_mul(); $code.="
# Shift the stream left
movdqa $B0, $A0
movdqa $C0, $B0
movdqa $D0, $C0
movdqa $A1, $D0
movdqa $B1, $A1
movdqa $C1, $B1
movdqa $D1, $C1
jmp .Lseal_sse_128_tail_xor
.Lseal_sse_tail_16:
test $inl, $inl
jz .Lprocess_blocks_of_extra_in
# We can only load the PT one byte at a time to avoid buffer overread
mov $inl, $itr2
mov $inl, $itr1
lea -1($inp,$inl), $inp
pxor $T3, $T3
.Lseal_sse_tail_16_compose:
pslldq \$1, $T3
pinsrb \$0, ($inp), $T3
lea -1($inp), $inp
dec $itr1
jne .Lseal_sse_tail_16_compose
# XOR the keystream with the plaintext.
pxor $A0, $T3
# Write ciphertext out, byte-by-byte.
movq $inl, $itr1
movdqu $T3, $A0
.Lseal_sse_tail_16_extract:
pextrb \$0, $A0, ($oup)
psrldq \$1, $A0
add \$1, $oup
sub \$1, $itr1
jnz .Lseal_sse_tail_16_extract
# $T3 contains the final (partial, non-empty) block of ciphertext which
# needs to be fed into the Poly1305 state. The right-most $inl bytes of it
# are valid. We need to fill it with extra_in bytes until full, or until we
# run out of bytes.
#
# $keyp points to the tag output, which is actually a struct with the
# extra_in pointer and length at offset 48.
movq 288 + $xmm_storage + 32(%rsp), $keyp
movq 56($keyp), $t1 # extra_in_len
movq 48($keyp), $t0 # extra_in
test $t1, $t1
jz .Lprocess_partial_block # Common case: no bytes of extra_in
movq \$16, $t2
subq $inl, $t2 # 16-$inl is the number of bytes that fit into $T3.
cmpq $t2, $t1 # if extra_in_len < 16-$inl, only copy extra_in_len
# (note that AT&T syntax reverses the arguments)
jge .Lload_extra_in
movq $t1, $t2
.Lload_extra_in:
# $t2 contains the number of bytes of extra_in (pointed to by $t0) to load
# into $T3. They are loaded in reverse order.
leaq -1($t0,$t2), $inp
# Update extra_in and extra_in_len to reflect the bytes that are about to
# be read.
addq $t2, $t0
subq $t2, $t1
movq $t0, 48($keyp)
movq $t1, 56($keyp)
# Update $itr2, which is used to select the mask later on, to reflect the
# extra bytes about to be added.
addq $t2, $itr2
# Load $t2 bytes of extra_in into $T2.
pxor $T2, $T2
.Lload_extra_load_loop:
pslldq \$1, $T2
pinsrb \$0, ($inp), $T2
lea -1($inp), $inp
sub \$1, $t2
jnz .Lload_extra_load_loop
# Shift $T2 up the length of the remainder from the main encryption. Sadly,
# the shift for an XMM register has to be a constant, thus we loop to do
# this.
movq $inl, $t2
.Lload_extra_shift_loop:
pslldq \$1, $T2
sub \$1, $t2
jnz .Lload_extra_shift_loop
# Mask $T3 (the remainder from the main encryption) so that superfluous
# bytes are zero. This means that the non-zero bytes in $T2 and $T3 are
# disjoint and so we can merge them with an OR.
lea .Land_masks(%rip), $t2
shl \$4, $inl
pand -16($t2,$inl), $T3
# Merge $T2 into $T3, forming the remainder block.
por $T2, $T3
# The block of ciphertext + extra_in is ready to be included in the
# Poly1305 state.
movq $T3, $t0
pextrq \$1, $T3, $t1
add $t0, $acc0
adc $t1, $acc1
adc \$1, $acc2\n";
&poly_mul(); $code.="
.Lprocess_blocks_of_extra_in:
# There may be additional bytes of extra_in to process.
movq 288+32+$xmm_storage (%rsp), $keyp
movq 48($keyp), $inp # extra_in
movq 56($keyp), $itr2 # extra_in_len
movq $itr2, $itr1
shr \$4, $itr2 # number of blocks
.Lprocess_extra_hash_loop:
jz process_extra_in_trailer\n";
&poly_add("0($inp)");
&poly_mul(); $code.="
leaq 16($inp), $inp
subq \$1, $itr2
jmp .Lprocess_extra_hash_loop
process_extra_in_trailer:
andq \$15, $itr1 # remaining num bytes (<16) of extra_in
movq $itr1, $inl
jz .Ldo_length_block
leaq -1($inp,$itr1), $inp
.Lprocess_extra_in_trailer_load:
pslldq \$1, $T3
pinsrb \$0, ($inp), $T3
lea -1($inp), $inp
sub \$1, $itr1
jnz .Lprocess_extra_in_trailer_load
.Lprocess_partial_block:
# $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0
lea .Land_masks(%rip), $t2
shl \$4, $inl
pand -16($t2,$inl), $T3
movq $T3, $t0
pextrq \$1, $T3, $t1
add $t0, $acc0
adc $t1, $acc1
adc \$1, $acc2\n";
&poly_mul(); $code.="
.Ldo_length_block:\n";
&poly_add($len_store);
&poly_mul(); $code.="
# Final reduce
mov $acc0, $t0
mov $acc1, $t1
mov $acc2, $t2
sub \$-5, $acc0
sbb \$-1, $acc1
sbb \$3, $acc2
cmovc $t0, $acc0
cmovc $t1, $acc1
cmovc $t2, $acc2
# Add in s part of the key
add 0+$s_store, $acc0
adc 8+$s_store, $acc1\n";
$code.="
movaps 16*0+$xmm_store, %xmm6
movaps 16*1+$xmm_store, %xmm7
movaps 16*2+$xmm_store, %xmm8
movaps 16*3+$xmm_store, %xmm9
movaps 16*4+$xmm_store, %xmm10
movaps 16*5+$xmm_store, %xmm11
movaps 16*6+$xmm_store, %xmm12
movaps 16*7+$xmm_store, %xmm13
movaps 16*8+$xmm_store, %xmm14
movaps 16*9+$xmm_store, %xmm15\n" if ($win64);
$code.="
.cfi_remember_state
add \$288 + $xmm_storage + 32, %rsp
.cfi_adjust_cfa_offset -(288 + 32)
# The tag replaces the key on return
pop $keyp
.cfi_pop $keyp
mov $acc0, ($keyp)
mov $acc1, 8($keyp)
pop %r15
.cfi_pop %r15
pop %r14
.cfi_pop %r14
pop %r13
.cfi_pop %r13
pop %r12
.cfi_pop %r12
pop %rbx
.cfi_pop %rbx
pop %rbp
.cfi_pop %rbp
ret
################################################################################
.Lseal_sse_128:
.cfi_restore_state
movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
movdqu 2*16($keyp), $D2
movdqa $D2, $D0\npaddd .Lsse_inc(%rip), $D0
movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1
movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
mov \$10, $acc0
.Lseal_sse_128_rounds:\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jnz .Lseal_sse_128_rounds
paddd .Lchacha20_consts(%rip), $A0
paddd .Lchacha20_consts(%rip), $A1
paddd .Lchacha20_consts(%rip), $A2
paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
paddd $T2, $C0\npaddd $T2, $C1
paddd $T3, $D0
paddd .Lsse_inc(%rip), $T3
paddd $T3, $D1
# Clamp and store the key
pand .Lclamp(%rip), $A2
movdqa $A2, $r_store
movdqa $B2, $s_store
# Hash
mov %r8, $itr2
call poly_hash_ad_internal
jmp .Lseal_sse_128_tail_xor
.size chacha20_poly1305_seal, .-chacha20_poly1305_seal
.cfi_endproc\n";
}
if ($avx>1) {
($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
$state1_store="$xmm_storage+2*32(%rbp)";
$state2_store="$xmm_storage+3*32(%rbp)";
$tmp_store="$xmm_storage+4*32(%rbp)";
$ctr0_store="$xmm_storage+5*32(%rbp)";
$ctr1_store="$xmm_storage+6*32(%rbp)";
$ctr2_store="$xmm_storage+7*32(%rbp)";
$ctr3_store="$xmm_storage+8*32(%rbp)";
sub chacha_qr_avx2 {
my ($a,$b,$c,$d,$t,$dir)=@_;
$code.=<<___ if ($dir =~ /store/);
vmovdqa $t, $tmp_store
___
$code.=<<___;
vpaddd $b, $a, $a
vpxor $a, $d, $d
vpshufb .Lrol16(%rip), $d, $d
vpaddd $d, $c, $c
vpxor $c, $b, $b
vpsrld \$20, $b, $t
vpslld \$12, $b, $b
vpxor $t, $b, $b
vpaddd $b, $a, $a
vpxor $a, $d, $d
vpshufb .Lrol8(%rip), $d, $d
vpaddd $d, $c, $c
vpxor $c, $b, $b
vpslld \$7, $b, $t
vpsrld \$25, $b, $b
vpxor $t, $b, $b
___
$code.=<<___ if ($dir =~ /left/);
vpalignr \$12, $d, $d, $d
vpalignr \$8, $c, $c, $c
vpalignr \$4, $b, $b, $b
___
$code.=<<___ if ($dir =~ /right/);
vpalignr \$4, $d, $d, $d
vpalignr \$8, $c, $c, $c
vpalignr \$12, $b, $b, $b
___
$code.=<<___ if ($dir =~ /load/);
vmovdqa $tmp_store, $t
___
}
sub prep_state_avx2 {
my ($n)=@_;
$code.=<<___;
vmovdqa .Lchacha20_consts(%rip), $A0
vmovdqa $state1_store, $B0
vmovdqa $state2_store, $C0
___
$code.=<<___ if ($n ge 2);
vmovdqa $A0, $A1
vmovdqa $B0, $B1
vmovdqa $C0, $C1
___
$code.=<<___ if ($n ge 3);
vmovdqa $A0, $A2
vmovdqa $B0, $B2
vmovdqa $C0, $C2
___
$code.=<<___ if ($n ge 4);
vmovdqa $A0, $A3
vmovdqa $B0, $B3
vmovdqa $C0, $C3
___
$code.=<<___ if ($n eq 1);
vmovdqa .Lavx2_inc(%rip), $D0
vpaddd $ctr0_store, $D0, $D0
vmovdqa $D0, $ctr0_store
___
$code.=<<___ if ($n eq 2);
vmovdqa .Lavx2_inc(%rip), $D0
vpaddd $ctr0_store, $D0, $D1
vpaddd $D1, $D0, $D0
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
___
$code.=<<___ if ($n eq 3);
vmovdqa .Lavx2_inc(%rip), $D0
vpaddd $ctr0_store, $D0, $D2
vpaddd $D2, $D0, $D1
vpaddd $D1, $D0, $D0
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
___
$code.=<<___ if ($n eq 4);
vmovdqa .Lavx2_inc(%rip), $D0
vpaddd $ctr0_store, $D0, $D3
vpaddd $D3, $D0, $D2
vpaddd $D2, $D0, $D1
vpaddd $D1, $D0, $D0
vmovdqa $D3, $ctr3_store
vmovdqa $D2, $ctr2_store
vmovdqa $D1, $ctr1_store
vmovdqa $D0, $ctr0_store
___
}
sub finalize_state_avx2 {
my ($n)=@_;
$code.=<<___ if ($n eq 4);
vpaddd .Lchacha20_consts(%rip), $A3, $A3
vpaddd $state1_store, $B3, $B3
vpaddd $state2_store, $C3, $C3
vpaddd $ctr3_store, $D3, $D3
___
$code.=<<___ if ($n ge 3);
vpaddd .Lchacha20_consts(%rip), $A2, $A2
vpaddd $state1_store, $B2, $B2
vpaddd $state2_store, $C2, $C2
vpaddd $ctr2_store, $D2, $D2
___
$code.=<<___ if ($n ge 2);
vpaddd .Lchacha20_consts(%rip), $A1, $A1
vpaddd $state1_store, $B1, $B1
vpaddd $state2_store, $C1, $C1
vpaddd $ctr1_store, $D1, $D1
___
$code.=<<___;
vpaddd .Lchacha20_consts(%rip), $A0, $A0
vpaddd $state1_store, $B0, $B0
vpaddd $state2_store, $C0, $C0
vpaddd $ctr0_store, $D0, $D0
___
}
sub xor_stream_avx2 {
my ($A, $B, $C, $D, $offset, $hlp)=@_;
$code.=<<___;
vperm2i128 \$0x02, $A, $B, $hlp
vperm2i128 \$0x13, $A, $B, $B
vperm2i128 \$0x02, $C, $D, $A
vperm2i128 \$0x13, $C, $D, $C
vpxor 0*32+$offset($inp), $hlp, $hlp
vpxor 1*32+$offset($inp), $A, $A
vpxor 2*32+$offset($inp), $B, $B
vpxor 3*32+$offset($inp), $C, $C
vmovdqu $hlp, 0*32+$offset($oup)
vmovdqu $A, 1*32+$offset($oup)
vmovdqu $B, 2*32+$offset($oup)
vmovdqu $C, 3*32+$offset($oup)
___
}
sub finish_stream_avx2 {
my ($A, $B, $C, $D, $hlp)=@_;
$code.=<<___;
vperm2i128 \$0x13, $A, $B, $hlp
vperm2i128 \$0x02, $A, $B, $A
vperm2i128 \$0x02, $C, $D, $B
vperm2i128 \$0x13, $C, $D, $D
vmovdqa $hlp, $C
___
}
sub poly_stage1_mulx {
$code.=<<___;
mov 0+$r_store, %rdx
mov %rdx, $t2
mulx $acc0, $t0, $t1
mulx $acc1, %rax, %rdx
imulq $acc2, $t2
add %rax, $t1
adc %rdx, $t2
___
}
sub poly_stage2_mulx {
$code.=<<___;
mov 8+$r_store, %rdx
mulx $acc0, $acc0, %rax
add $acc0, $t1
mulx $acc1, $acc1, $t3
adc $acc1, $t2
adc \$0, $t3
imulq $acc2, %rdx
___
}
sub poly_stage3_mulx {
$code.=<<___;
add %rax, $t2
adc %rdx, $t3
___
}
sub poly_mul_mulx {
&poly_stage1_mulx();
&poly_stage2_mulx();
&poly_stage3_mulx();
&poly_reduce_stage();
}
sub gen_chacha_round_avx2 {
my ($rot1, $rot2, $shift)=@_;
my $round="";
$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20);
$round=$round ."vmovdqa $rot2, $C0
vpaddd $B3, $A3, $A3
vpaddd $B2, $A2, $A2
vpaddd $B1, $A1, $A1
vpaddd $B0, $A0, $A0
vpxor $A3, $D3, $D3
vpxor $A2, $D2, $D2
vpxor $A1, $D1, $D1
vpxor $A0, $D0, $D0
vpshufb $C0, $D3, $D3
vpshufb $C0, $D2, $D2
vpshufb $C0, $D1, $D1
vpshufb $C0, $D0, $D0
vpaddd $D3, $C3, $C3
vpaddd $D2, $C2, $C2
vpaddd $D1, $C1, $C1
vpaddd $tmp_store, $D0, $C0
vpxor $C3, $B3, $B3
vpxor $C2, $B2, $B2
vpxor $C1, $B1, $B1
vpxor $C0, $B0, $B0
vmovdqa $C0, $tmp_store
vpsrld \$$rot1, $B3, $C0
vpslld \$32-$rot1, $B3, $B3
vpxor $C0, $B3, $B3
vpsrld \$$rot1, $B2, $C0
vpslld \$32-$rot1, $B2, $B2
vpxor $C0, $B2, $B2
vpsrld \$$rot1, $B1, $C0
vpslld \$32-$rot1, $B1, $B1
vpxor $C0, $B1, $B1
vpsrld \$$rot1, $B0, $C0
vpslld \$32-$rot1, $B0, $B0
vpxor $C0, $B0, $B0\n";
($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
$round=$round ."vmovdqa $tmp_store, $C0
vpalignr \$$s1, $B3, $B3, $B3
vpalignr \$$s2, $C3, $C3, $C3
vpalignr \$$s3, $D3, $D3, $D3
vpalignr \$$s1, $B2, $B2, $B2
vpalignr \$$s2, $C2, $C2, $C2
vpalignr \$$s3, $D2, $D2, $D2
vpalignr \$$s1, $B1, $B1, $B1
vpalignr \$$s2, $C1, $C1, $C1
vpalignr \$$s3, $D1, $D1, $D1
vpalignr \$$s1, $B0, $B0, $B0
vpalignr \$$s2, $C0, $C0, $C0
vpalignr \$$s3, $D0, $D0, $D0\n"
if (($shift =~ /left/) || ($shift =~ /right/));
return $round;
};
$chacha_body = &gen_chacha_round_avx2(20, ".Lrol16(%rip)") .
&gen_chacha_round_avx2(25, ".Lrol8(%rip)", "left") .
&gen_chacha_round_avx2(20, ".Lrol16(%rip)") .
&gen_chacha_round_avx2(25, ".Lrol8(%rip)", "right");
@loop_body = split /\n/, $chacha_body;
$code.="
###############################################################################
.type chacha20_poly1305_open_avx2,\@abi-omnipotent
.align 64
chacha20_poly1305_open_avx2:
.cfi_startproc
# Since the AVX2 function operates in the frame of the SSE function, we just copy the frame state to over here
.cfi_push %rbp
.cfi_push %rbx
.cfi_push %r12
.cfi_push %r13
.cfi_push %r14
.cfi_push %r15
.cfi_push $keyp
.cfi_adjust_cfa_offset 288 + 32
vzeroupper
vmovdqa .Lchacha20_consts(%rip), $A0
vbroadcasti128 0*16($keyp), $B0
vbroadcasti128 1*16($keyp), $C0
vbroadcasti128 2*16($keyp), $D0
vpaddd .Lavx2_init(%rip), $D0, $D0
cmp \$6*32, $inl
jbe .Lopen_avx2_192
cmp \$10*32, $inl
jbe .Lopen_avx2_320
vmovdqa $B0, $state1_store
vmovdqa $C0, $state2_store
vmovdqa $D0, $ctr0_store
mov \$10, $acc0
.Lopen_avx2_init_rounds: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
dec $acc0
jne .Lopen_avx2_init_rounds
vpaddd .Lchacha20_consts(%rip), $A0, $A0
vpaddd $state1_store, $B0, $B0
vpaddd $state2_store, $C0, $C0
vpaddd $ctr0_store, $D0, $D0
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store key
vpand .Lclamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for the first 64 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
# Hash AD + first 64 bytes
mov $adl, $itr2
call poly_hash_ad_internal
# Hash first 64 bytes
xor $itr1, $itr1
.Lopen_avx2_init_hash: \n";
&poly_add("0($inp,$itr1)");
&poly_mul(); $code.="
add \$16, $itr1
cmp \$2*32, $itr1
jne .Lopen_avx2_init_hash
# Decrypt first 64 bytes
vpxor 0*32($inp), $A0, $A0
vpxor 1*32($inp), $B0, $B0
# Store first 64 bytes of decrypted data
vmovdqu $A0, 0*32($oup)
vmovdqu $B0, 1*32($oup)
lea 2*32($inp), $inp
lea 2*32($oup), $oup
sub \$2*32, $inl
.Lopen_avx2_main_loop:
# Hash and decrypt 512 bytes each iteration
cmp \$16*32, $inl
jb .Lopen_avx2_main_loop_done\n";
&prep_state_avx2(4); $code.="
xor $itr1, $itr1
.Lopen_avx2_main_loop_rounds: \n";
&poly_add("0*8($inp,$itr1)");
&emit_body(10);
&poly_stage1_mulx();
&emit_body(9);
&poly_stage2_mulx();
&emit_body(12);
&poly_stage3_mulx();
&emit_body(10);
&poly_reduce_stage();
&emit_body(9);
&poly_add("2*8($inp,$itr1)");
&emit_body(8);
&poly_stage1_mulx();
&emit_body(18);
&poly_stage2_mulx();
&emit_body(18);
&poly_stage3_mulx();
&emit_body(9);
&poly_reduce_stage();
&emit_body(8);
&poly_add("4*8($inp,$itr1)"); $code.="
lea 6*8($itr1), $itr1\n";
&emit_body(18);
&poly_stage1_mulx();
&emit_body(8);
&poly_stage2_mulx();
&emit_body(8);
&poly_stage3_mulx();
&emit_body(18);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
cmp \$10*6*8, $itr1
jne .Lopen_avx2_main_loop_rounds\n";
&finalize_state_avx2(4); $code.="
vmovdqa $A0, $tmp_store\n";
&poly_add("10*6*8($inp)");
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
vmovdqa $tmp_store, $A0\n";
&poly_mul();
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
&poly_add("10*6*8+2*8($inp)");
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
&poly_mul();
&xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
lea 16*32($inp), $inp
lea 16*32($oup), $oup
sub \$16*32, $inl
jmp .Lopen_avx2_main_loop
.Lopen_avx2_main_loop_done:
test $inl, $inl
vzeroupper
je .Lopen_sse_finalize
cmp \$12*32, $inl
ja .Lopen_avx2_tail_512
cmp \$8*32, $inl
ja .Lopen_avx2_tail_384
cmp \$4*32, $inl
ja .Lopen_avx2_tail_256\n";
###############################################################################
# 1-128 bytes left
&prep_state_avx2(1); $code.="
xor $itr2, $itr2
mov $inl, $itr1
and \$-16, $itr1
test $itr1, $itr1
je .Lopen_avx2_tail_128_rounds # Have nothing to hash
.Lopen_avx2_tail_128_rounds_and_x1hash: \n";
&poly_add("0*8($inp,$itr2)");
&poly_mul(); $code.="
.Lopen_avx2_tail_128_rounds:
add \$16, $itr2\n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
cmp $itr1, $itr2
jb .Lopen_avx2_tail_128_rounds_and_x1hash
cmp \$160, $itr2
jne .Lopen_avx2_tail_128_rounds\n";
&finalize_state_avx2(1);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
jmp .Lopen_avx2_tail_128_xor
###############################################################################
.Lopen_avx2_tail_256: \n";
# 129-256 bytes left
&prep_state_avx2(2); $code.="
mov $inl, $tmp_store
mov $inl, $itr1
sub \$4*32, $itr1
shr \$4, $itr1
mov \$10, $itr2
cmp \$10, $itr1
cmovg $itr2, $itr1
mov $inp, $inl
xor $itr2, $itr2
.Lopen_avx2_tail_256_rounds_and_x1hash: \n";
&poly_add("0*8($inl)");
&poly_mul_mulx(); $code.="
lea 16($inl), $inl
.Lopen_avx2_tail_256_rounds: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
inc $itr2\n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
cmp $itr1, $itr2
jb .Lopen_avx2_tail_256_rounds_and_x1hash
cmp \$10, $itr2
jne .Lopen_avx2_tail_256_rounds
mov $inl, $itr2
sub $inp, $inl
mov $inl, $itr1
mov $tmp_store, $inl
.Lopen_avx2_tail_256_hash:
add \$16, $itr1
cmp $inl, $itr1
jg .Lopen_avx2_tail_256_done\n";
&poly_add("0*8($itr2)");
&poly_mul_mulx(); $code.="
lea 16($itr2), $itr2
jmp .Lopen_avx2_tail_256_hash
.Lopen_avx2_tail_256_done: \n";
&finalize_state_avx2(2);
&xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
&finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
lea 4*32($inp), $inp
lea 4*32($oup), $oup
sub \$4*32, $inl
jmp .Lopen_avx2_tail_128_xor
###############################################################################
.Lopen_avx2_tail_384: \n";
# 257-383 bytes left
&prep_state_avx2(3); $code.="
mov $inl, $tmp_store
mov $inl, $itr1
sub \$8*32, $itr1
shr \$4, $itr1
add \$6, $itr1
mov \$10, $itr2
cmp \$10, $itr1
cmovg $itr2, $itr1
mov $inp, $inl
xor $itr2, $itr2
.Lopen_avx2_tail_384_rounds_and_x2hash: \n";
&poly_add("0*8($inl)");
&poly_mul_mulx(); $code.="
lea 16($inl), $inl
.Lopen_avx2_tail_384_rounds_and_x1hash: \n";
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&poly_add("0*8($inl)");
&poly_mul(); $code.="
lea 16($inl), $inl
inc $itr2\n";
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
cmp $itr1, $itr2
jb .Lopen_avx2_tail_384_rounds_and_x2hash
cmp \$10, $itr2
jne .Lopen_avx2_tail_384_rounds_and_x1hash
mov $inl, $itr2
sub $inp, $inl
mov $inl, $itr1
mov $tmp_store, $inl
.Lopen_avx2_384_tail_hash:
add \$16, $itr1
cmp $inl, $itr1
jg .Lopen_avx2_384_tail_done\n";
&poly_add("0*8($itr2)");
&poly_mul_mulx(); $code.="
lea 16($itr2), $itr2
jmp .Lopen_avx2_384_tail_hash
.Lopen_avx2_384_tail_done: \n";
&finalize_state_avx2(3);
&xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
&xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
&finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
lea 8*32($inp), $inp
lea 8*32($oup), $oup
sub \$8*32, $inl
jmp .Lopen_avx2_tail_128_xor
###############################################################################
.Lopen_avx2_tail_512: \n";
# 384-512 bytes left
&prep_state_avx2(4); $code.="
xor $itr1, $itr1
mov $inp, $itr2
.Lopen_avx2_tail_512_rounds_and_x2hash: \n";
&poly_add("0*8($itr2)");
&poly_mul(); $code.="
lea 2*8($itr2), $itr2
.Lopen_avx2_tail_512_rounds_and_x1hash: \n";
&emit_body(37);
&poly_add("0*8($itr2)");
&poly_mul_mulx();
&emit_body(48);
&poly_add("2*8($itr2)");
&poly_mul_mulx(); $code.="
lea 4*8($itr2), $itr2\n";
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
inc $itr1
cmp \$4, $itr1
jl .Lopen_avx2_tail_512_rounds_and_x2hash
cmp \$10, $itr1
jne .Lopen_avx2_tail_512_rounds_and_x1hash
mov $inl, $itr1
sub \$12*32, $itr1
and \$-16, $itr1
.Lopen_avx2_tail_512_hash:
test $itr1, $itr1
je .Lopen_avx2_tail_512_done\n";
&poly_add("0*8($itr2)");
&poly_mul_mulx(); $code.="
lea 2*8($itr2), $itr2
sub \$2*8, $itr1
jmp .Lopen_avx2_tail_512_hash
.Lopen_avx2_tail_512_done: \n";
&finalize_state_avx2(4); $code.="
vmovdqa $A0, $tmp_store\n";
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
vmovdqa $tmp_store, $A0\n";
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
&finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
lea 12*32($inp), $inp
lea 12*32($oup), $oup
sub \$12*32, $inl
.Lopen_avx2_tail_128_xor:
cmp \$32, $inl
jb .Lopen_avx2_tail_32_xor
sub \$32, $inl
vpxor ($inp), $A0, $A0
vmovdqu $A0, ($oup)
lea 1*32($inp), $inp
lea 1*32($oup), $oup
vmovdqa $B0, $A0
vmovdqa $C0, $B0
vmovdqa $D0, $C0
jmp .Lopen_avx2_tail_128_xor
.Lopen_avx2_tail_32_xor:
cmp \$16, $inl
vmovdqa $A0x, $A1x
jb .Lopen_avx2_exit
sub \$16, $inl
#load for decryption
vpxor ($inp), $A0x, $A1x
vmovdqu $A1x, ($oup)
lea 1*16($inp), $inp
lea 1*16($oup), $oup
vperm2i128 \$0x11, $A0, $A0, $A0
vmovdqa $A0x, $A1x
.Lopen_avx2_exit:
vzeroupper
jmp .Lopen_sse_tail_16
###############################################################################
.Lopen_avx2_192:
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .Lavx2_inc(%rip), $D0, $D1
vmovdqa $D0, $T2
vmovdqa $D1, $T3
mov \$10, $acc0
.Lopen_avx2_192_rounds: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
dec $acc0
jne .Lopen_avx2_192_rounds
vpaddd $A2, $A0, $A0
vpaddd $A2, $A1, $A1
vpaddd $B2, $B0, $B0
vpaddd $B2, $B1, $B1
vpaddd $C2, $C0, $C0
vpaddd $C2, $C1, $C1
vpaddd $T2, $D0, $D0
vpaddd $T3, $D1, $D1
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store the key
vpand .Lclamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 192 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
.Lopen_avx2_short:
mov $adl, $itr2
call poly_hash_ad_internal
.Lopen_avx2_short_hash_and_xor_loop:
cmp \$32, $inl
jb .Lopen_avx2_short_tail_32
sub \$32, $inl\n";
# Load + hash
&poly_add("0*8($inp)");
&poly_mul();
&poly_add("2*8($inp)");
&poly_mul(); $code.="
# Load + decrypt
vpxor ($inp), $A0, $A0
vmovdqu $A0, ($oup)
lea 1*32($inp), $inp
lea 1*32($oup), $oup
# Shift stream
vmovdqa $B0, $A0
vmovdqa $C0, $B0
vmovdqa $D0, $C0
vmovdqa $A1, $D0
vmovdqa $B1, $A1
vmovdqa $C1, $B1
vmovdqa $D1, $C1
vmovdqa $A2, $D1
vmovdqa $B2, $A2
jmp .Lopen_avx2_short_hash_and_xor_loop
.Lopen_avx2_short_tail_32:
cmp \$16, $inl
vmovdqa $A0x, $A1x
jb .Lopen_avx2_short_tail_32_exit
sub \$16, $inl\n";
&poly_add("0*8($inp)");
&poly_mul(); $code.="
vpxor ($inp), $A0x, $A3x
vmovdqu $A3x, ($oup)
lea 1*16($inp), $inp
lea 1*16($oup), $oup
vextracti128 \$1, $A0, $A1x
.Lopen_avx2_short_tail_32_exit:
vzeroupper
jmp .Lopen_sse_tail_16
###############################################################################
.Lopen_avx2_320:
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .Lavx2_inc(%rip), $D0, $D1
vpaddd .Lavx2_inc(%rip), $D1, $D2
vmovdqa $B0, $T1
vmovdqa $C0, $T2
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
mov \$10, $acc0
.Lopen_avx2_320_rounds: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jne .Lopen_avx2_320_rounds
vpaddd .Lchacha20_consts(%rip), $A0, $A0
vpaddd .Lchacha20_consts(%rip), $A1, $A1
vpaddd .Lchacha20_consts(%rip), $A2, $A2
vpaddd $T1, $B0, $B0
vpaddd $T1, $B1, $B1
vpaddd $T1, $B2, $B2
vpaddd $T2, $C0, $C0
vpaddd $T2, $C1, $C1
vpaddd $T2, $C2, $C2
vpaddd $ctr0_store, $D0, $D0
vpaddd $ctr1_store, $D1, $D1
vpaddd $ctr2_store, $D2, $D2
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store the key
vpand .Lclamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 320 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
vperm2i128 \$0x02, $A2, $B2, $C1
vperm2i128 \$0x02, $C2, $D2, $D1
vperm2i128 \$0x13, $A2, $B2, $A2
vperm2i128 \$0x13, $C2, $D2, $B2
jmp .Lopen_avx2_short
.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
.cfi_endproc
###############################################################################
###############################################################################
.type chacha20_poly1305_seal_avx2,\@abi-omnipotent
.align 64
chacha20_poly1305_seal_avx2:
.cfi_startproc
# Since the AVX2 function operates in the frame of the SSE function, we just copy the frame state to over here
.cfi_push %rbp
.cfi_push %rbx
.cfi_push %r12
.cfi_push %r13
.cfi_push %r14
.cfi_push %r15
.cfi_push $keyp
.cfi_adjust_cfa_offset 288 + 32
vzeroupper
vmovdqa .Lchacha20_consts(%rip), $A0
vbroadcasti128 0*16($keyp), $B0
vbroadcasti128 1*16($keyp), $C0
vbroadcasti128 2*16($keyp), $D0
vpaddd .Lavx2_init(%rip), $D0, $D0
cmp \$6*32, $inl
jbe .Lseal_avx2_192
cmp \$10*32, $inl
jbe .Lseal_avx2_320
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $A0, $A3
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $B0, $B3
vmovdqa $B0, $state1_store
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vmovdqa $C0, $C3
vmovdqa $C0, $state2_store
vmovdqa $D0, $D3
vpaddd .Lavx2_inc(%rip), $D3, $D2
vpaddd .Lavx2_inc(%rip), $D2, $D1
vpaddd .Lavx2_inc(%rip), $D1, $D0
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
vmovdqa $D3, $ctr3_store
mov \$10, $acc0
.Lseal_avx2_init_rounds: \n";
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
dec $acc0
jnz .Lseal_avx2_init_rounds\n";
&finalize_state_avx2(4); $code.="
vperm2i128 \$0x13, $C3, $D3, $C3
vperm2i128 \$0x02, $A3, $B3, $D3
vperm2i128 \$0x13, $A3, $B3, $A3
vpand .Lclamp(%rip), $D3, $D3
vmovdqa $D3, $r_store
mov $adl, $itr2
call poly_hash_ad_internal
# Safely store 320 bytes (otherwise would handle with optimized call)
vpxor 0*32($inp), $A3, $A3
vpxor 1*32($inp), $C3, $C3
vmovdqu $A3, 0*32($oup)
vmovdqu $C3, 1*32($oup)\n";
&xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
&xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
lea 10*32($inp), $inp
sub \$10*32, $inl
mov \$10*32, $itr1
cmp \$4*32, $inl
jbe .Lseal_avx2_short_hash_remainder
vpxor 0*32($inp), $A0, $A0
vpxor 1*32($inp), $B0, $B0
vpxor 2*32($inp), $C0, $C0
vpxor 3*32($inp), $D0, $D0
vmovdqu $A0, 10*32($oup)
vmovdqu $B0, 11*32($oup)
vmovdqu $C0, 12*32($oup)
vmovdqu $D0, 13*32($oup)
lea 4*32($inp), $inp
sub \$4*32, $inl
mov \$8, $itr1
mov \$2, $itr2
cmp \$4*32, $inl
jbe .Lseal_avx2_tail_128
cmp \$8*32, $inl
jbe .Lseal_avx2_tail_256
cmp \$12*32, $inl
jbe .Lseal_avx2_tail_384
cmp \$16*32, $inl
jbe .Lseal_avx2_tail_512\n";
# We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
&prep_state_avx2(4);
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body;
&emit_body(41);
@loop_body = split /\n/, $chacha_body; $code.="
sub \$16, $oup
mov \$9, $itr1
jmp .Lseal_avx2_main_loop_rounds_entry
.align 32
.Lseal_avx2_main_loop: \n";
&prep_state_avx2(4); $code.="
mov \$10, $itr1
.align 32
.Lseal_avx2_main_loop_rounds: \n";
&poly_add("0*8($oup)");
&emit_body(10);
&poly_stage1_mulx();
&emit_body(9);
&poly_stage2_mulx();
&emit_body(12);
&poly_stage3_mulx();
&emit_body(10);
&poly_reduce_stage(); $code.="
.Lseal_avx2_main_loop_rounds_entry: \n";
&emit_body(9);
&poly_add("2*8($oup)");
&emit_body(8);
&poly_stage1_mulx();
&emit_body(18);
&poly_stage2_mulx();
&emit_body(18);
&poly_stage3_mulx();
&emit_body(9);
&poly_reduce_stage();
&emit_body(8);
&poly_add("4*8($oup)"); $code.="
lea 6*8($oup), $oup\n";
&emit_body(18);
&poly_stage1_mulx();
&emit_body(8);
&poly_stage2_mulx();
&emit_body(8);
&poly_stage3_mulx();
&emit_body(18);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
dec $itr1
jne .Lseal_avx2_main_loop_rounds\n";
&finalize_state_avx2(4); $code.="
vmovdqa $A0, $tmp_store\n";
&poly_add("0*8($oup)");
&poly_mul_mulx();
&poly_add("2*8($oup)");
&poly_mul_mulx(); $code.="
lea 4*8($oup), $oup\n";
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
vmovdqa $tmp_store, $A0\n";
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
&xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
lea 16*32($inp), $inp
sub \$16*32, $inl
cmp \$16*32, $inl
jg .Lseal_avx2_main_loop
\n";
&poly_add("0*8($oup)");
&poly_mul_mulx();
&poly_add("2*8($oup)");
&poly_mul_mulx(); $code.="
lea 4*8($oup), $oup
mov \$10, $itr1
xor $itr2, $itr2
cmp \$12*32, $inl
ja .Lseal_avx2_tail_512
cmp \$8*32, $inl
ja .Lseal_avx2_tail_384
cmp \$4*32, $inl
ja .Lseal_avx2_tail_256
###############################################################################
.Lseal_avx2_tail_128:\n";
&prep_state_avx2(1); $code.="
.Lseal_avx2_tail_128_rounds_and_3xhash: \n";
&poly_add("0($oup)");
&poly_mul_mulx(); $code.="
lea 2*8($oup), $oup
.Lseal_avx2_tail_128_rounds_and_2xhash: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&poly_add("0*8($oup)");
&poly_mul_mulx();
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&poly_add("2*8($oup)");
&poly_mul_mulx(); $code.="
lea 4*8($oup), $oup
dec $itr1
jg .Lseal_avx2_tail_128_rounds_and_3xhash
dec $itr2
jge .Lseal_avx2_tail_128_rounds_and_2xhash\n";
&finalize_state_avx2(1);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
jmp .Lseal_avx2_short_loop
###############################################################################
.Lseal_avx2_tail_256:\n";
&prep_state_avx2(2); $code.="
.Lseal_avx2_tail_256_rounds_and_3xhash: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 2*8($oup), $oup
.Lseal_avx2_tail_256_rounds_and_2xhash: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&poly_add("0*8($oup)");
&poly_mul();
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&poly_add("2*8($oup)");
&poly_mul(); $code.="
lea 4*8($oup), $oup
dec $itr1
jg .Lseal_avx2_tail_256_rounds_and_3xhash
dec $itr2
jge .Lseal_avx2_tail_256_rounds_and_2xhash\n";
&finalize_state_avx2(2);
&xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
mov \$4*32, $itr1
lea 4*32($inp), $inp
sub \$4*32, $inl
jmp .Lseal_avx2_short_hash_remainder
###############################################################################
.Lseal_avx2_tail_384:\n";
&prep_state_avx2(3); $code.="
.Lseal_avx2_tail_384_rounds_and_3xhash: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 2*8($oup), $oup
.Lseal_avx2_tail_384_rounds_and_2xhash: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&poly_add("0*8($oup)");
&poly_mul();
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&poly_add("2*8($oup)");
&poly_mul();
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
lea 4*8($oup), $oup
dec $itr1
jg .Lseal_avx2_tail_384_rounds_and_3xhash
dec $itr2
jge .Lseal_avx2_tail_384_rounds_and_2xhash\n";
&finalize_state_avx2(3);
&xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
&xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
mov \$8*32, $itr1
lea 8*32($inp), $inp
sub \$8*32, $inl
jmp .Lseal_avx2_short_hash_remainder
###############################################################################
.Lseal_avx2_tail_512:\n";
&prep_state_avx2(4); $code.="
.Lseal_avx2_tail_512_rounds_and_3xhash: \n";
&poly_add("0($oup)");
&poly_mul_mulx(); $code.="
lea 2*8($oup), $oup
.Lseal_avx2_tail_512_rounds_and_2xhash: \n";
&emit_body(20);
&poly_add("0*8($oup)");
&emit_body(20);
&poly_stage1_mulx();
&emit_body(20);
&poly_stage2_mulx();
&emit_body(20);
&poly_stage3_mulx();
&emit_body(20);
&poly_reduce_stage();
&emit_body(20);
&poly_add("2*8($oup)");
&emit_body(20);
&poly_stage1_mulx();
&emit_body(20);
&poly_stage2_mulx();
&emit_body(20);
&poly_stage3_mulx();
&emit_body(20);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
lea 4*8($oup), $oup
dec $itr1
jg .Lseal_avx2_tail_512_rounds_and_3xhash
dec $itr2
jge .Lseal_avx2_tail_512_rounds_and_2xhash\n";
&finalize_state_avx2(4); $code.="
vmovdqa $A0, $tmp_store\n";
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
vmovdqa $tmp_store, $A0\n";
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
mov \$12*32, $itr1
lea 12*32($inp), $inp
sub \$12*32, $inl
jmp .Lseal_avx2_short_hash_remainder
################################################################################
.Lseal_avx2_320:
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .Lavx2_inc(%rip), $D0, $D1
vpaddd .Lavx2_inc(%rip), $D1, $D2
vmovdqa $B0, $T1
vmovdqa $C0, $T2
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
mov \$10, $acc0
.Lseal_avx2_320_rounds: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jne .Lseal_avx2_320_rounds
vpaddd .Lchacha20_consts(%rip), $A0, $A0
vpaddd .Lchacha20_consts(%rip), $A1, $A1
vpaddd .Lchacha20_consts(%rip), $A2, $A2
vpaddd $T1, $B0, $B0
vpaddd $T1, $B1, $B1
vpaddd $T1, $B2, $B2
vpaddd $T2, $C0, $C0
vpaddd $T2, $C1, $C1
vpaddd $T2, $C2, $C2
vpaddd $ctr0_store, $D0, $D0
vpaddd $ctr1_store, $D1, $D1
vpaddd $ctr2_store, $D2, $D2
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store the key
vpand .Lclamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 320 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
vperm2i128 \$0x02, $A2, $B2, $C1
vperm2i128 \$0x02, $C2, $D2, $D1
vperm2i128 \$0x13, $A2, $B2, $A2
vperm2i128 \$0x13, $C2, $D2, $B2
jmp .Lseal_avx2_short
################################################################################
.Lseal_avx2_192:
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .Lavx2_inc(%rip), $D0, $D1
vmovdqa $D0, $T2
vmovdqa $D1, $T3
mov \$10, $acc0
.Lseal_avx2_192_rounds: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
dec $acc0
jne .Lseal_avx2_192_rounds
vpaddd $A2, $A0, $A0
vpaddd $A2, $A1, $A1
vpaddd $B2, $B0, $B0
vpaddd $B2, $B1, $B1
vpaddd $C2, $C0, $C0
vpaddd $C2, $C1, $C1
vpaddd $T2, $D0, $D0
vpaddd $T3, $D1, $D1
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store the key
vpand .Lclamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 192 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
.Lseal_avx2_short:
mov $adl, $itr2
call poly_hash_ad_internal
xor $itr1, $itr1
.Lseal_avx2_short_hash_remainder:
cmp \$16, $itr1
jb .Lseal_avx2_short_loop\n";
&poly_add("0($oup)");
&poly_mul(); $code.="
sub \$16, $itr1
add \$16, $oup
jmp .Lseal_avx2_short_hash_remainder
.Lseal_avx2_short_loop:
cmp \$32, $inl
jb .Lseal_avx2_short_tail
sub \$32, $inl
# Encrypt
vpxor ($inp), $A0, $A0
vmovdqu $A0, ($oup)
lea 1*32($inp), $inp
# Load + hash\n";
&poly_add("0*8($oup)");
&poly_mul();
&poly_add("2*8($oup)");
&poly_mul(); $code.="
lea 1*32($oup), $oup
# Shift stream
vmovdqa $B0, $A0
vmovdqa $C0, $B0
vmovdqa $D0, $C0
vmovdqa $A1, $D0
vmovdqa $B1, $A1
vmovdqa $C1, $B1
vmovdqa $D1, $C1
vmovdqa $A2, $D1
vmovdqa $B2, $A2
jmp .Lseal_avx2_short_loop
.Lseal_avx2_short_tail:
cmp \$16, $inl
jb .Lseal_avx2_exit
sub \$16, $inl
vpxor ($inp), $A0x, $A3x
vmovdqu $A3x, ($oup)
lea 1*16($inp), $inp\n";
&poly_add("0*8($oup)");
&poly_mul(); $code.="
lea 1*16($oup), $oup
vextracti128 \$1, $A0, $A0x
.Lseal_avx2_exit:
vzeroupper
jmp .Lseal_sse_tail_16
.cfi_endproc
.size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2
";
}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";