| #!/usr/bin/env perl |
| |
| # Copyright (c) 2017, Shay Gueron. |
| # Copyright (c) 2017, Google Inc. |
| # |
| # Permission to use, copy, modify, and/or distribute this software for any |
| # purpose with or without fee is hereby granted, provided that the above |
| # copyright notice and this permission notice appear in all copies. |
| # |
| # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
| # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
| # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
| |
| use warnings FATAL => 'all'; |
| |
| $flavour = shift; |
| $output = shift; |
| if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| |
| $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); |
| |
| $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| die "can't locate x86_64-xlate.pl"; |
| |
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
| *STDOUT=*OUT; |
| |
| $code.=<<___; |
| .data |
| |
| .align 16 |
| one: |
| .quad 1,0 |
| two: |
| .quad 2,0 |
| three: |
| .quad 3,0 |
| four: |
| .quad 4,0 |
| five: |
| .quad 5,0 |
| six: |
| .quad 6,0 |
| seven: |
| .quad 7,0 |
| eight: |
| .quad 8,0 |
| |
| OR_MASK: |
| .long 0x00000000,0x00000000,0x00000000,0x80000000 |
| poly: |
| .quad 0x1, 0xc200000000000000 |
| mask: |
| .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d |
| con1: |
| .long 1,1,1,1 |
| con2: |
| .long 0x1b,0x1b,0x1b,0x1b |
| con3: |
| .byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7 |
| and_mask: |
| .long 0,0xffffffff, 0xffffffff, 0xffffffff |
| ___ |
| |
| $code.=<<___; |
| .text |
| ___ |
| |
| sub gfmul { |
| ######################### |
| # a = T |
| # b = TMP0 - remains unchanged |
| # res = T |
| # uses also TMP1,TMP2,TMP3,TMP4 |
| # __m128i GFMUL(__m128i A, __m128i B); |
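#
# For reference, a hedged C sketch of the same computation using PCLMULQDQ
# intrinsics (illustration only, not the code emitted below; gfmul_ref is a
# hypothetical name; needs -mpclmul). POLYVAL multiplies in GF(2^128) and
# reduces modulo x^128 + x^127 + x^126 + x^121 + 1; the two folding steps
# against |poly| perform that reduction:
#
#   #include <emmintrin.h>
#   #include <wmmintrin.h>
#
#   static __m128i gfmul_ref(__m128i a, __m128i b) {
#     const __m128i poly = _mm_set_epi64x((long long)0xc200000000000000ULL, 1);
#     __m128i lo  = _mm_clmulepi64_si128(a, b, 0x00);
#     __m128i hi  = _mm_clmulepi64_si128(a, b, 0x11);
#     __m128i mid = _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x10),
#                                 _mm_clmulepi64_si128(a, b, 0x01));
#     lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
#     hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
#     for (int i = 0; i < 2; i++) {                /* two folding steps */
#       __m128i t = _mm_clmulepi64_si128(lo, poly, 0x10);
#       lo = _mm_xor_si128(_mm_shuffle_epi32(lo, 78), t);
#     }
#     return _mm_xor_si128(hi, lo);
#   }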
| |
| my $T = "%xmm0"; |
| my $TMP0 = "%xmm1"; |
| my $TMP1 = "%xmm2"; |
| my $TMP2 = "%xmm3"; |
| my $TMP3 = "%xmm4"; |
| my $TMP4 = "%xmm5"; |
| |
| $code.=<<___; |
| .type GFMUL,\@abi-omnipotent |
| .align 16 |
| GFMUL: |
| .cfi_startproc |
| vpclmulqdq \$0x00, $TMP0, $T, $TMP1 |
| vpclmulqdq \$0x11, $TMP0, $T, $TMP4 |
| vpclmulqdq \$0x10, $TMP0, $T, $TMP2 |
| vpclmulqdq \$0x01, $TMP0, $T, $TMP3 |
| vpxor $TMP3, $TMP2, $TMP2 |
| vpslldq \$8, $TMP2, $TMP3 |
| vpsrldq \$8, $TMP2, $TMP2 |
| vpxor $TMP3, $TMP1, $TMP1 |
| vpxor $TMP2, $TMP4, $TMP4 |
| |
| vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2 |
| vpshufd \$78, $TMP1, $TMP3 |
| vpxor $TMP3, $TMP2, $TMP1 |
| |
| vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2 |
| vpshufd \$78, $TMP1, $TMP3 |
| vpxor $TMP3, $TMP2, $TMP1 |
| |
| vpxor $TMP4, $TMP1, $T |
| ret |
| .cfi_endproc |
| .size GFMUL, .-GFMUL |
| ___ |
| } |
| gfmul(); |
| |
| sub aesgcmsiv_htable_init { |
| # aesgcmsiv_htable_init writes an eight-entry table of powers of |H| to |
| # |out_htable|. |
| # void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H); |
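#
# A hedged C sketch of the table this routine fills (illustration only;
# gfmul_ref is the hypothetical reference multiply sketched above GFMUL):
#
#   __m128i h = _mm_loadu_si128((const __m128i *)H);
#   __m128i t = h;
#   _mm_storeu_si128((__m128i *)out_htable, h);              /* H       */
#   for (int i = 1; i < 8; i++) {
#     t = gfmul_ref(t, h);
#     _mm_storeu_si128((__m128i *)(out_htable + 16 * i), t); /* H^(i+1) */
#   }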
| |
| my $Htbl = "%rdi"; |
| my $H = "%rsi"; |
| my $T = "%xmm0"; |
| my $TMP0 = "%xmm1"; |
| |
| $code.=<<___; |
| .globl aesgcmsiv_htable_init |
| .type aesgcmsiv_htable_init,\@function,2 |
| .align 16 |
| aesgcmsiv_htable_init: |
| .cfi_startproc |
| vmovdqa ($H), $T |
| vmovdqa $T, $TMP0 |
| vmovdqa $T, ($Htbl) # H |
| call GFMUL |
| vmovdqa $T, 16($Htbl) # H^2 |
| call GFMUL |
| vmovdqa $T, 32($Htbl) # H^3 |
| call GFMUL |
| vmovdqa $T, 48($Htbl) # H^4 |
| call GFMUL |
| vmovdqa $T, 64($Htbl) # H^5 |
| call GFMUL |
| vmovdqa $T, 80($Htbl) # H^6 |
| call GFMUL |
| vmovdqa $T, 96($Htbl) # H^7 |
| call GFMUL |
| vmovdqa $T, 112($Htbl) # H^8 |
| ret |
| .cfi_endproc |
| .size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init |
| ___ |
| } |
| aesgcmsiv_htable_init(); |
| |
| sub aesgcmsiv_htable6_init { |
| # aesgcmsiv_htable6_init writes a six-entry table of powers of |H| to |
| # |out_htable|. |
| # void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H); |
| # |
| my $Htbl = "%rdi"; |
| my $H = "%rsi"; |
| my $T = "%xmm0"; |
| my $TMP0 = "%xmm1"; |
| |
| $code.=<<___; |
| .globl aesgcmsiv_htable6_init |
| .type aesgcmsiv_htable6_init,\@function,2 |
| .align 16 |
| aesgcmsiv_htable6_init: |
| .cfi_startproc |
| vmovdqa ($H), $T |
| vmovdqa $T, $TMP0 |
| vmovdqa $T, ($Htbl) # H |
| call GFMUL |
| vmovdqa $T, 16($Htbl) # H^2 |
| call GFMUL |
| vmovdqa $T, 32($Htbl) # H^3 |
| call GFMUL |
| vmovdqa $T, 48($Htbl) # H^4 |
| call GFMUL |
| vmovdqa $T, 64($Htbl) # H^5 |
| call GFMUL |
| vmovdqa $T, 80($Htbl) # H^6 |
| ret |
| .cfi_endproc |
| .size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init |
| ___ |
| } |
| aesgcmsiv_htable6_init(); |
| |
| sub aesgcmsiv_htable_polyval { |
| # void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T); |
| # parameter 1: %rdi Htable - pointer to Htable |
| # parameter 2: %rsi INp - pointer to input |
| # parameter 3: %rdx LEN - length of BUFFER in bytes |
| # parameter 4: %rcx T - pointer to POLYVAL output |
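#
# What one 8-block batch computes, as a hedged sketch of the math (not code
# that appears below): with Xi the i-th 16-byte block of the batch and T the
# running POLYVAL value,
#
#   T' = (T ^ X1)*H^8 ^ X2*H^7 ^ ... ^ X8*H^1      (arithmetic in GF(2^128))
#
# which equals eight Horner steps T = GFMUL(T ^ Xi, H). The per-block products
# are accumulated unreduced via the SCHOOLBOOK_AAD helper below, and the
# folding steps against |poly| are interleaved with the multiplications.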
| |
| my $DATA = "%xmm0"; |
| my $hlp0 = "%r11"; |
| my $Htbl = "%rdi"; |
| my $inp = "%rsi"; |
| my $len = "%rdx"; |
| my $TMP0 = "%xmm3"; |
| my $TMP1 = "%xmm4"; |
| my $TMP2 = "%xmm5"; |
| my $TMP3 = "%xmm6"; |
| my $TMP4 = "%xmm7"; |
| my $Tp = "%rcx"; |
| my $T = "%xmm1"; |
| my $Xhi = "%xmm9"; |
| |
| my $SCHOOLBOOK_AAD = sub { |
| my ($i)=@_; |
| return <<___; |
| vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3 |
| vpxor $TMP3, $TMP2, $TMP2 |
| vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3 |
| vpxor $TMP3, $TMP0, $TMP0 |
| vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3 |
| vpxor $TMP3, $TMP1, $TMP1 |
| vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3 |
| vpxor $TMP3, $TMP2, $TMP2 |
| ___ |
| }; |
| |
| $code.=<<___; |
| .globl aesgcmsiv_htable_polyval |
| .type aesgcmsiv_htable_polyval,\@function,4 |
| .align 16 |
| aesgcmsiv_htable_polyval: |
| .cfi_startproc |
| test $len, $len |
| jnz .Lhtable_polyval_start |
| ret |
| |
| .Lhtable_polyval_start: |
| vzeroall |
| |
| # We hash 8 blocks each iteration. If the total number of blocks is not a |
| # multiple of 8, we first hash the leading n%8 blocks. |
| movq $len, $hlp0 |
| andq \$127, $hlp0 |
| |
| jz .Lhtable_polyval_no_prefix |
| |
| vpxor $Xhi, $Xhi, $Xhi |
| vmovdqa ($Tp), $T |
| sub $hlp0, $len |
| |
| sub \$16, $hlp0 |
| |
| # hash first prefix block |
| vmovdqu ($inp), $DATA |
| vpxor $T, $DATA, $DATA |
| |
| vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2 |
| vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0 |
| vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1 |
| vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3 |
| vpxor $TMP3, $TMP2, $TMP2 |
| |
| lea 16($inp), $inp |
| test $hlp0, $hlp0 |
| jnz .Lhtable_polyval_prefix_loop |
| jmp .Lhtable_polyval_prefix_complete |
| |
# hash remaining prefix blocks (up to 7 total prefix blocks)
| .align 64 |
| .Lhtable_polyval_prefix_loop: |
| sub \$16, $hlp0 |
| |
| vmovdqu ($inp), $DATA # next data block |
| |
| vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP3 |
| vpxor $TMP3, $TMP0, $TMP0 |
| vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP3 |
| vpxor $TMP3, $TMP1, $TMP1 |
| vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP3 |
| vpxor $TMP3, $TMP2, $TMP2 |
| vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3 |
| vpxor $TMP3, $TMP2, $TMP2 |
| |
| test $hlp0, $hlp0 |
| |
| lea 16($inp), $inp |
| |
| jnz .Lhtable_polyval_prefix_loop |
| |
| .Lhtable_polyval_prefix_complete: |
| vpsrldq \$8, $TMP2, $TMP3 |
| vpslldq \$8, $TMP2, $TMP2 |
| |
| vpxor $TMP3, $TMP1, $Xhi |
| vpxor $TMP2, $TMP0, $T |
| |
| jmp .Lhtable_polyval_main_loop |
| |
| .Lhtable_polyval_no_prefix: |
| # At this point we know the number of blocks is a multiple of 8. However, |
| # the reduction in the main loop includes a multiplication by x^(-128). In |
# order to counter this, the existing tag needs to be multiplied by x^128.
| # In practice, this just means that it is loaded into $Xhi, not $T. |
| vpxor $T, $T, $T |
| vmovdqa ($Tp), $Xhi |
| |
| .align 64 |
| .Lhtable_polyval_main_loop: |
| sub \$0x80, $len |
| jb .Lhtable_polyval_out |
| |
| vmovdqu 16*7($inp), $DATA # Ii |
| |
| vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2 |
| vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0 |
| vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1 |
| vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3 |
| vpxor $TMP3, $TMP2, $TMP2 |
| |
| ######################################################### |
| vmovdqu 16*6($inp), $DATA |
| ${\$SCHOOLBOOK_AAD->(1)} |
| |
| ######################################################### |
| vmovdqu 16*5($inp), $DATA |
| |
| vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 1a |
| vpalignr \$8, $T, $T, $T |
| |
| ${\$SCHOOLBOOK_AAD->(2)} |
| |
| vpxor $TMP4, $T, $T # reduction stage 1b |
| ######################################################### |
| vmovdqu 16*4($inp), $DATA |
| |
| ${\$SCHOOLBOOK_AAD->(3)} |
| ######################################################### |
| vmovdqu 16*3($inp), $DATA |
| |
| vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 2a |
| vpalignr \$8, $T, $T, $T |
| |
| ${\$SCHOOLBOOK_AAD->(4)} |
| |
| vpxor $TMP4, $T, $T # reduction stage 2b |
| ######################################################### |
| vmovdqu 16*2($inp), $DATA |
| |
| ${\$SCHOOLBOOK_AAD->(5)} |
| |
| vpxor $Xhi, $T, $T # reduction finalize |
| ######################################################### |
| vmovdqu 16*1($inp), $DATA |
| |
| ${\$SCHOOLBOOK_AAD->(6)} |
| ######################################################### |
| vmovdqu 16*0($inp), $DATA |
| vpxor $T, $DATA, $DATA |
| |
| ${\$SCHOOLBOOK_AAD->(7)} |
| ######################################################### |
| vpsrldq \$8, $TMP2, $TMP3 |
| vpslldq \$8, $TMP2, $TMP2 |
| |
| vpxor $TMP3, $TMP1, $Xhi |
| vpxor $TMP2, $TMP0, $T |
| |
| lea 16*8($inp), $inp |
| jmp .Lhtable_polyval_main_loop |
| |
| ######################################################### |
| |
| .Lhtable_polyval_out: |
| vpclmulqdq \$0x10, poly(%rip), $T, $TMP3 |
| vpalignr \$8, $T, $T, $T |
| vpxor $TMP3, $T, $T |
| |
| vpclmulqdq \$0x10, poly(%rip), $T, $TMP3 |
| vpalignr \$8, $T, $T, $T |
| vpxor $TMP3, $T, $T |
| vpxor $Xhi, $T, $T |
| |
| vmovdqu $T, ($Tp) |
| vzeroupper |
| ret |
| .cfi_endproc |
| .size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval |
| ___ |
| } |
| aesgcmsiv_htable_polyval(); |
| |
| sub aesgcmsiv_polyval_horner { |
| #void aesgcmsiv_polyval_horner(unsigned char T[16], // output |
| # const unsigned char* H, // H |
| # unsigned char* BUF, // Buffer |
| # unsigned int blocks); // Len2 |
| # |
# parameter 1: %rdi T - pointer to POLYVAL output
| # parameter 2: %rsi Hp - pointer to H (user key) |
| # parameter 3: %rdx INp - pointer to input |
| # parameter 4: %rcx L - total number of blocks in input BUFFER |
| # |
| my $T = "%rdi"; |
| my $Hp = "%rsi"; |
| my $INp = "%rdx"; |
| my $L = "%rcx"; |
| my $LOC = "%r10"; |
| my $LEN = "%eax"; |
| my $H = "%xmm1"; |
| my $RES = "%xmm0"; |
| |
| $code.=<<___; |
| .globl aesgcmsiv_polyval_horner |
| .type aesgcmsiv_polyval_horner,\@function,4 |
| .align 16 |
| aesgcmsiv_polyval_horner: |
| .cfi_startproc |
| test $L, $L |
| jnz .Lpolyval_horner_start |
| ret |
| |
| .Lpolyval_horner_start: |
# Perform L Horner steps over the buffer: for each 16-byte block Xi,
# RES = GFMUL(RES xor Xi, H)
| |
| xorq $LOC, $LOC |
| shlq \$4, $L # L contains number of bytes to process |
| |
| vmovdqa ($Hp), $H |
| vmovdqa ($T), $RES |
| |
| .Lpolyval_horner_loop: |
| vpxor ($INp,$LOC), $RES, $RES # RES = RES + Xi |
| call GFMUL # RES = RES * H |
| |
| add \$16, $LOC |
| cmp $LOC, $L |
| jne .Lpolyval_horner_loop |
| |
| # calculation of T is complete. RES=T |
| vmovdqa $RES, ($T) |
| ret |
| .cfi_endproc |
| .size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner |
| ___ |
| } |
| aesgcmsiv_polyval_horner(); |
| |
| # void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key); |
| # parameter 1: %rdi |
| # parameter 2: %rsi |
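#
# The expansion below avoids AESKEYGENASSIST: each round key is derived from
# the previous one with a byte shuffle plus AESENCLAST (ShiftRows is a no-op
# when all four dwords are equal, so AESENCLAST yields SubWord(RotWord(w3))
# XORed with the round constant), followed by a prefix-XOR of the four words.
# A hedged C sketch of one such step (illustration only; ks128_round_ref is a
# hypothetical name; needs <tmmintrin.h>/<wmmintrin.h>, -mssse3 -maes):
#
#   static __m128i ks128_round_ref(__m128i key, __m128i *rcon) {
#     const __m128i rot = _mm_set1_epi32(0x0c0f0e0d);  /* same as |mask| */
#     __m128i t = _mm_shuffle_epi8(key, rot);          /* broadcast RotWord(w3) */
#     t = _mm_aesenclast_si128(t, *rcon);              /* SubWord, then ^= rcon */
#     *rcon = _mm_slli_epi32(*rcon, 1);   /* 1,2,...,0x80; the code below
#                                            switches to con2 (0x1b) for the
#                                            final rounds */
#     key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
#     key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
#     key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
#     return _mm_xor_si128(key, t);
#   }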
| $code.=<<___; |
| .globl aes128gcmsiv_aes_ks |
| .type aes128gcmsiv_aes_ks,\@function,2 |
| .align 16 |
| aes128gcmsiv_aes_ks: |
| .cfi_startproc |
| vmovdqu (%rdi), %xmm1 # xmm1 = user key |
| vmovdqa %xmm1, (%rsi) # rsi points to output |
| |
| vmovdqa con1(%rip), %xmm0 |
| vmovdqa mask(%rip), %xmm15 |
| |
| movq \$8, %rax |
| |
| .Lks128_loop: |
| addq \$16, %rsi # rsi points for next key |
| subq \$1, %rax |
| vpshufb %xmm15, %xmm1, %xmm2 # xmm2 = shuffled user key |
| vaesenclast %xmm0, %xmm2, %xmm2 |
| vpslld \$1, %xmm0, %xmm0 |
| vpslldq \$4, %xmm1, %xmm3 |
| vpxor %xmm3, %xmm1, %xmm1 |
| vpslldq \$4, %xmm3, %xmm3 |
| vpxor %xmm3, %xmm1, %xmm1 |
| vpslldq \$4, %xmm3, %xmm3 |
| vpxor %xmm3, %xmm1, %xmm1 |
| vpxor %xmm2, %xmm1, %xmm1 |
| vmovdqa %xmm1, (%rsi) |
| jne .Lks128_loop |
| |
| vmovdqa con2(%rip), %xmm0 |
| vpshufb %xmm15, %xmm1, %xmm2 |
| vaesenclast %xmm0, %xmm2, %xmm2 |
| vpslld \$1, %xmm0, %xmm0 |
| vpslldq \$4, %xmm1, %xmm3 |
| vpxor %xmm3, %xmm1, %xmm1 |
| vpslldq \$4, %xmm3, %xmm3 |
| vpxor %xmm3, %xmm1, %xmm1 |
| vpslldq \$4, %xmm3, %xmm3 |
| vpxor %xmm3, %xmm1, %xmm1 |
| vpxor %xmm2, %xmm1, %xmm1 |
| vmovdqa %xmm1, 16(%rsi) |
| |
| vpshufb %xmm15, %xmm1, %xmm2 |
| vaesenclast %xmm0, %xmm2, %xmm2 |
| vpslldq \$4, %xmm1, %xmm3 |
| vpxor %xmm3, %xmm1, %xmm1 |
| vpslldq \$4, %xmm3, %xmm3 |
| vpxor %xmm3, %xmm1, %xmm1 |
| vpslldq \$4, %xmm3, %xmm3 |
| vpxor %xmm3, %xmm1, %xmm1 |
| vpxor %xmm2, %xmm1, %xmm1 |
| vmovdqa %xmm1, 32(%rsi) |
| ret |
| .cfi_endproc |
| .size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks |
| ___ |
| |
| # void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key); |
| # parameter 1: %rdi |
| # parameter 2: %rsi |
| $code.=<<___; |
| .globl aes256gcmsiv_aes_ks |
| .type aes256gcmsiv_aes_ks,\@function,2 |
| .align 16 |
| aes256gcmsiv_aes_ks: |
| .cfi_startproc |
| vmovdqu (%rdi), %xmm1 |
| vmovdqu 16(%rdi), %xmm3 |
| vmovdqa %xmm1, (%rsi) |
| vmovdqa %xmm3, 16(%rsi) |
| vmovdqa con1(%rip), %xmm0 |
| vmovdqa mask(%rip), %xmm15 |
| vpxor %xmm14, %xmm14, %xmm14 |
| mov \$6, %rax |
| |
| .Lks256_loop: |
| add \$32, %rsi |
| subq \$1, %rax |
| vpshufb %xmm15, %xmm3, %xmm2 |
| vaesenclast %xmm0, %xmm2, %xmm2 |
| vpslld \$1, %xmm0, %xmm0 |
| vpsllq \$32, %xmm1, %xmm4 |
| vpxor %xmm4, %xmm1, %xmm1 |
| vpshufb con3(%rip), %xmm1, %xmm4 |
| vpxor %xmm4, %xmm1, %xmm1 |
| vpxor %xmm2, %xmm1, %xmm1 |
| vmovdqa %xmm1, (%rsi) |
| vpshufd \$0xff, %xmm1, %xmm2 |
| vaesenclast %xmm14, %xmm2, %xmm2 |
| vpsllq \$32, %xmm3, %xmm4 |
| vpxor %xmm4, %xmm3, %xmm3 |
| vpshufb con3(%rip), %xmm3, %xmm4 |
| vpxor %xmm4, %xmm3, %xmm3 |
| vpxor %xmm2, %xmm3, %xmm3 |
| vmovdqa %xmm3, 16(%rsi) |
| jne .Lks256_loop |
| |
| vpshufb %xmm15, %xmm3, %xmm2 |
| vaesenclast %xmm0, %xmm2, %xmm2 |
| vpsllq \$32, %xmm1, %xmm4 |
| vpxor %xmm4, %xmm1, %xmm1 |
| vpshufb con3(%rip), %xmm1, %xmm4 |
| vpxor %xmm4, %xmm1, %xmm1 |
| vpxor %xmm2, %xmm1, %xmm1 |
| vmovdqa %xmm1, 32(%rsi) |
| ret |
.cfi_endproc
.size aes256gcmsiv_aes_ks,.-aes256gcmsiv_aes_ks
| ___ |
| |
| sub aes128gcmsiv_aes_ks_enc_x1 { |
| my $KS1_REGA = "%xmm1"; |
| my $KS1_REGB = "%xmm2"; |
| my $BLOCK1 = "%xmm4"; |
| my $AUXREG = "%xmm3"; |
| |
| my $KS_BLOCK = sub { |
| my ($reg, $reg2, $auxReg) = @_; |
| return <<___; |
vpsllq \$32, $reg, $auxReg # shift directly into the aux register, saving a mov
| vpxor $auxReg, $reg, $reg |
| vpshufb con3(%rip), $reg, $auxReg |
| vpxor $auxReg, $reg, $reg |
| vpxor $reg2, $reg, $reg |
| ___ |
| }; |
| |
| my $round = sub { |
| my ($i, $j) = @_; |
| return <<___; |
vpshufb %xmm15, %xmm1, %xmm2 # shuffle directly into xmm2, saving a mov
| vaesenclast %xmm0, %xmm2, %xmm2 |
| vpslld \$1, %xmm0, %xmm0 |
| ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)} |
| vaesenc %xmm1, $BLOCK1, $BLOCK1 |
| vmovdqa %xmm1, ${\eval(16*$i)}($j) |
| ___ |
| }; |
| |
| my $roundlast = sub { |
| my ($i, $j) = @_; |
| return <<___; |
vpshufb %xmm15, %xmm1, %xmm2 # shuffle directly into xmm2, saving a mov
| vaesenclast %xmm0, %xmm2, %xmm2 |
| ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)} |
| vaesenclast %xmm1, $BLOCK1, $BLOCK1 |
| vmovdqa %xmm1, ${\eval(16*$i)}($j) |
| ___ |
| }; |
| |
| # parameter 1: %rdi Pointer to PT |
| # parameter 2: %rsi Pointer to CT |
# parameter 3: %rdx Pointer to keys
# parameter 4: %rcx Pointer to initial key
| $code.=<<___; |
| .globl aes128gcmsiv_aes_ks_enc_x1 |
| .type aes128gcmsiv_aes_ks_enc_x1,\@function,4 |
| .align 16 |
| aes128gcmsiv_aes_ks_enc_x1: |
| .cfi_startproc |
| vmovdqa (%rcx), %xmm1 # xmm1 = first 16 bytes of random key |
| vmovdqa 0*16(%rdi), $BLOCK1 |
| |
| vmovdqa %xmm1, (%rdx) # KEY[0] = first 16 bytes of random key |
| vpxor %xmm1, $BLOCK1, $BLOCK1 |
| |
| vmovdqa con1(%rip), %xmm0 # xmm0 = 1,1,1,1 |
| vmovdqa mask(%rip), %xmm15 # xmm15 = mask |
| |
| ${\$round->(1, "%rdx")} |
| ${\$round->(2, "%rdx")} |
| ${\$round->(3, "%rdx")} |
| ${\$round->(4, "%rdx")} |
| ${\$round->(5, "%rdx")} |
| ${\$round->(6, "%rdx")} |
| ${\$round->(7, "%rdx")} |
| ${\$round->(8, "%rdx")} |
| |
| vmovdqa con2(%rip), %xmm0 |
| |
| ${\$round->(9, "%rdx")} |
| ${\$roundlast->(10, "%rdx")} |
| |
| vmovdqa $BLOCK1, 0*16(%rsi) |
| ret |
| .cfi_endproc |
| .size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1 |
| ___ |
| } |
| aes128gcmsiv_aes_ks_enc_x1(); |
| |
| sub aes128gcmsiv_kdf { |
| my $BLOCK1 = "%xmm9"; |
| my $BLOCK2 = "%xmm10"; |
| my $BLOCK3 = "%xmm11"; |
| my $BLOCK4 = "%xmm12"; |
| my $BLOCK5 = "%xmm13"; |
| my $BLOCK6 = "%xmm14"; |
| my $ONE = "%xmm13"; |
| my $KSp = "%rdx"; |
| my $STATE_1 = "%xmm1"; |
| |
| my $enc_roundx4 = sub { |
| my ($i, $j) = @_; |
| return <<___; |
| vmovdqa ${\eval($i*16)}(%rdx), $j |
| vaesenc $j, $BLOCK1, $BLOCK1 |
| vaesenc $j, $BLOCK2, $BLOCK2 |
| vaesenc $j, $BLOCK3, $BLOCK3 |
| vaesenc $j, $BLOCK4, $BLOCK4 |
| ___ |
| }; |
| |
| my $enc_roundlastx4 = sub { |
| my ($i, $j) = @_; |
| return <<___; |
| vmovdqa ${\eval($i*16)}(%rdx), $j |
| vaesenclast $j, $BLOCK1, $BLOCK1 |
| vaesenclast $j, $BLOCK2, $BLOCK2 |
| vaesenclast $j, $BLOCK3, $BLOCK3 |
| vaesenclast $j, $BLOCK4, $BLOCK4 |
| ___ |
| }; |
| |
| # void aes128gcmsiv_kdf(const uint8_t nonce[16], |
| # uint8_t *out_key_material, |
| # const uint8_t *key_schedule); |
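#
# A hedged C sketch of the derivation (illustration only; kdf128_ref is a
# hypothetical name, the key schedule is assumed to be 11 aligned round keys,
# and <stdint.h>/<string.h> plus the intrinsic headers are assumed). Bytes
# 0..3 of each block hold a little-endian counter 0..3, bytes 4..15 hold the
# 96-bit nonce, and each block is AES-128 encrypted:
#
#   static void kdf128_ref(const uint8_t nonce[16], uint8_t out[64],
#                          const __m128i ks[11]) {
#     for (uint32_t i = 0; i < 4; i++) {
#       uint8_t in[16];
#       memcpy(in, &i, 4);            /* little-endian block counter */
#       memcpy(in + 4, nonce, 12);    /* 96-bit nonce */
#       __m128i b = _mm_loadu_si128((const __m128i *)in);
#       b = _mm_xor_si128(b, ks[0]);
#       for (int r = 1; r < 10; r++) b = _mm_aesenc_si128(b, ks[r]);
#       b = _mm_aesenclast_si128(b, ks[10]);
#       _mm_storeu_si128((__m128i *)(out + 16 * i), b);
#     }
#   }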
| $code.=<<___; |
| .globl aes128gcmsiv_kdf |
| .type aes128gcmsiv_kdf,\@function,3 |
| .align 16 |
| aes128gcmsiv_kdf: |
| .cfi_startproc |
| # parameter 1: %rdi Pointer to NONCE |
| # parameter 2: %rsi Pointer to CT |
# parameter 3: %rdx Pointer to keys
| |
| vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key |
| vmovdqa 0*16(%rdi), $BLOCK1 |
| vmovdqa and_mask(%rip), $BLOCK4 |
| vmovdqa one(%rip), $ONE |
| vpshufd \$0x90, $BLOCK1, $BLOCK1 |
| vpand $BLOCK4, $BLOCK1, $BLOCK1 |
| vpaddd $ONE, $BLOCK1, $BLOCK2 |
| vpaddd $ONE, $BLOCK2, $BLOCK3 |
| vpaddd $ONE, $BLOCK3, $BLOCK4 |
| |
| vpxor %xmm1, $BLOCK1, $BLOCK1 |
| vpxor %xmm1, $BLOCK2, $BLOCK2 |
| vpxor %xmm1, $BLOCK3, $BLOCK3 |
| vpxor %xmm1, $BLOCK4, $BLOCK4 |
| |
| ${\$enc_roundx4->(1, "%xmm1")} |
| ${\$enc_roundx4->(2, "%xmm2")} |
| ${\$enc_roundx4->(3, "%xmm1")} |
| ${\$enc_roundx4->(4, "%xmm2")} |
| ${\$enc_roundx4->(5, "%xmm1")} |
| ${\$enc_roundx4->(6, "%xmm2")} |
| ${\$enc_roundx4->(7, "%xmm1")} |
| ${\$enc_roundx4->(8, "%xmm2")} |
| ${\$enc_roundx4->(9, "%xmm1")} |
| ${\$enc_roundlastx4->(10, "%xmm2")} |
| |
| vmovdqa $BLOCK1, 0*16(%rsi) |
| vmovdqa $BLOCK2, 1*16(%rsi) |
| vmovdqa $BLOCK3, 2*16(%rsi) |
| vmovdqa $BLOCK4, 3*16(%rsi) |
| ret |
| .cfi_endproc |
| .size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf |
| ___ |
| } |
| aes128gcmsiv_kdf(); |
| |
| sub aes128gcmsiv_enc_msg_x4 { |
| my $CTR1 = "%xmm0"; |
| my $CTR2 = "%xmm1"; |
| my $CTR3 = "%xmm2"; |
| my $CTR4 = "%xmm3"; |
| my $ADDER = "%xmm4"; |
| |
| my $STATE1 = "%xmm5"; |
| my $STATE2 = "%xmm6"; |
| my $STATE3 = "%xmm7"; |
| my $STATE4 = "%xmm8"; |
| |
| my $TMP = "%xmm12"; |
| my $TMP2 = "%xmm13"; |
| my $TMP3 = "%xmm14"; |
| my $IV = "%xmm15"; |
| |
| my $PT = "%rdi"; |
| my $CT = "%rsi"; |
| my $TAG = "%rdx"; |
| my $KS = "%rcx"; |
| my $LEN = "%r8"; |
| |
| my $aes_round = sub { |
| my ($i) = @_; |
| return <<___; |
| vmovdqu ${\eval($i*16)}($KS), $TMP |
| vaesenc $TMP, $STATE1, $STATE1 |
| vaesenc $TMP, $STATE2, $STATE2 |
| vaesenc $TMP, $STATE3, $STATE3 |
| vaesenc $TMP, $STATE4, $STATE4 |
| ___ |
| }; |
| |
| my $aes_lastround = sub { |
| my ($i) = @_; |
| return <<___; |
| vmovdqu ${\eval($i*16)}($KS), $TMP |
| vaesenclast $TMP, $STATE1, $STATE1 |
| vaesenclast $TMP, $STATE2, $STATE2 |
| vaesenclast $TMP, $STATE3, $STATE3 |
| vaesenclast $TMP, $STATE4, $STATE4 |
| ___ |
| }; |
| |
| # void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT, |
| # unsigned char* TAG, unsigned char* KS, |
| # size_t byte_len); |
| # parameter 1: %rdi #PT |
| # parameter 2: %rsi #CT |
| # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32] |
| # parameter 4: %rcx #KS |
| # parameter 5: %r8 #LEN MSG_length in bytes |
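#
# A hedged scalar sketch of the counter handling shared by the enc_msg_x4/x8
# routines (illustration only; aes128_encrypt_block is a hypothetical helper).
# The initial counter block is the tag with its top bit forced to 1 (OR_MASK),
# and only the low 32 bits are incremented, little-endian, per block:
#
#   uint8_t ctr[16], keystream[16];
#   memcpy(ctr, TAG, 16);
#   ctr[15] |= 0x80;                              /* OR_MASK: set bit 127 */
#   for (size_t i = 0; i < byte_len / 16; i++) {  /* whole blocks only */
#     aes128_encrypt_block(ctr, keystream, KS);   /* hypothetical helper */
#     for (int j = 0; j < 16; j++) CT[16*i + j] = PT[16*i + j] ^ keystream[j];
#     uint32_t c; memcpy(&c, ctr, 4); c++; memcpy(ctr, &c, 4);
#   }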
| $code.=<<___; |
| .globl aes128gcmsiv_enc_msg_x4 |
| .type aes128gcmsiv_enc_msg_x4,\@function,5 |
| .align 16 |
| aes128gcmsiv_enc_msg_x4: |
| .cfi_startproc |
| test $LEN, $LEN |
| jnz .L128_enc_msg_x4_start |
| ret |
| |
| .L128_enc_msg_x4_start: |
| pushq %r12 |
| .cfi_push %r12 |
| pushq %r13 |
| .cfi_push %r13 |
| |
| shrq \$4, $LEN # LEN = num of blocks |
| movq $LEN, %r10 |
| shlq \$62, %r10 |
| shrq \$62, %r10 |
| |
| # make IV from TAG |
| vmovdqa ($TAG), $IV |
| vpor OR_MASK(%rip), $IV, $IV #IV = [1]TAG[126...32][00..00] |
| |
| vmovdqu four(%rip), $ADDER # Register to increment counters |
| vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00] |
| vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01] |
| vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02] |
| vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03] |
| |
| shrq \$2, $LEN |
| je .L128_enc_msg_x4_check_remainder |
| |
| subq \$64, $CT |
| subq \$64, $PT |
| |
| .L128_enc_msg_x4_loop1: |
| addq \$64, $CT |
| addq \$64, $PT |
| |
| vmovdqa $CTR1, $STATE1 |
| vmovdqa $CTR2, $STATE2 |
| vmovdqa $CTR3, $STATE3 |
| vmovdqa $CTR4, $STATE4 |
| |
| vpxor ($KS), $STATE1, $STATE1 |
| vpxor ($KS), $STATE2, $STATE2 |
| vpxor ($KS), $STATE3, $STATE3 |
| vpxor ($KS), $STATE4, $STATE4 |
| |
| ${\$aes_round->(1)} |
| vpaddd $ADDER, $CTR1, $CTR1 |
| ${\$aes_round->(2)} |
| vpaddd $ADDER, $CTR2, $CTR2 |
| ${\$aes_round->(3)} |
| vpaddd $ADDER, $CTR3, $CTR3 |
| ${\$aes_round->(4)} |
| vpaddd $ADDER, $CTR4, $CTR4 |
| |
| ${\$aes_round->(5)} |
| ${\$aes_round->(6)} |
| ${\$aes_round->(7)} |
| ${\$aes_round->(8)} |
| ${\$aes_round->(9)} |
| ${\$aes_lastround->(10)} |
| |
| # XOR with Plaintext |
| vpxor 0*16($PT), $STATE1, $STATE1 |
| vpxor 1*16($PT), $STATE2, $STATE2 |
| vpxor 2*16($PT), $STATE3, $STATE3 |
| vpxor 3*16($PT), $STATE4, $STATE4 |
| |
| subq \$1, $LEN |
| |
| vmovdqu $STATE1, 0*16($CT) |
| vmovdqu $STATE2, 1*16($CT) |
| vmovdqu $STATE3, 2*16($CT) |
| vmovdqu $STATE4, 3*16($CT) |
| |
| jne .L128_enc_msg_x4_loop1 |
| |
| addq \$64,$CT |
| addq \$64,$PT |
| |
| .L128_enc_msg_x4_check_remainder: |
| cmpq \$0, %r10 |
| je .L128_enc_msg_x4_out |
| |
| .L128_enc_msg_x4_loop2: |
# encrypt each remaining block separately
# CTR1 holds the next counter value (even if the main loop was skipped)
| vmovdqa $CTR1, $STATE1 |
| vpaddd one(%rip), $CTR1, $CTR1 # inc counter |
| |
| vpxor ($KS), $STATE1, $STATE1 |
| vaesenc 16($KS), $STATE1, $STATE1 |
| vaesenc 32($KS), $STATE1, $STATE1 |
| vaesenc 48($KS), $STATE1, $STATE1 |
| vaesenc 64($KS), $STATE1, $STATE1 |
| vaesenc 80($KS), $STATE1, $STATE1 |
| vaesenc 96($KS), $STATE1, $STATE1 |
| vaesenc 112($KS), $STATE1, $STATE1 |
| vaesenc 128($KS), $STATE1, $STATE1 |
| vaesenc 144($KS), $STATE1, $STATE1 |
| vaesenclast 160($KS), $STATE1, $STATE1 |
| |
| # XOR with plaintext |
| vpxor ($PT), $STATE1, $STATE1 |
| vmovdqu $STATE1, ($CT) |
| |
| addq \$16, $PT |
| addq \$16, $CT |
| |
| subq \$1, %r10 |
| jne .L128_enc_msg_x4_loop2 |
| |
| .L128_enc_msg_x4_out: |
| popq %r13 |
| .cfi_pop %r13 |
| popq %r12 |
| .cfi_pop %r12 |
| ret |
| .cfi_endproc |
| .size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4 |
| ___ |
| } |
| aes128gcmsiv_enc_msg_x4(); |
| |
| sub aes128gcmsiv_enc_msg_x8 { |
| my $STATE1 = "%xmm1"; |
| my $STATE2 = "%xmm2"; |
| my $STATE3 = "%xmm3"; |
| my $STATE4 = "%xmm4"; |
| my $STATE5 = "%xmm5"; |
| my $STATE6 = "%xmm6"; |
| my $STATE7 = "%xmm7"; |
| my $STATE8 = "%xmm8"; |
| |
| my $CTR1 = "%xmm0"; |
| my $CTR2 = "%xmm9"; |
| my $CTR3 = "%xmm10"; |
| my $CTR4 = "%xmm11"; |
| my $CTR5 = "%xmm12"; |
| my $CTR6 = "%xmm13"; |
| my $CTR7 = "%xmm14"; |
| my $SCHED = "%xmm15"; |
| |
| my $TMP1 = "%xmm1"; |
| my $TMP2 = "%xmm2"; |
| |
| my $PT = "%rdi"; |
| my $CT = "%rsi"; |
| my $TAG = "%rdx"; |
| my $KS = "%rcx"; |
| my $LEN = "%r8"; |
| |
| my $aes_round8 = sub { |
| my ($i) = @_; |
| return <<___; |
| vmovdqu ${\eval($i*16)}($KS), $SCHED |
| vaesenc $SCHED, $STATE1, $STATE1 |
| vaesenc $SCHED, $STATE2, $STATE2 |
| vaesenc $SCHED, $STATE3, $STATE3 |
| vaesenc $SCHED, $STATE4, $STATE4 |
| vaesenc $SCHED, $STATE5, $STATE5 |
| vaesenc $SCHED, $STATE6, $STATE6 |
| vaesenc $SCHED, $STATE7, $STATE7 |
| vaesenc $SCHED, $STATE8, $STATE8 |
| ___ |
| }; |
| |
| my $aes_lastround8 = sub { |
| my ($i) = @_; |
| return <<___; |
| vmovdqu ${\eval($i*16)}($KS), $SCHED |
| vaesenclast $SCHED, $STATE1, $STATE1 |
| vaesenclast $SCHED, $STATE2, $STATE2 |
| vaesenclast $SCHED, $STATE3, $STATE3 |
| vaesenclast $SCHED, $STATE4, $STATE4 |
| vaesenclast $SCHED, $STATE5, $STATE5 |
| vaesenclast $SCHED, $STATE6, $STATE6 |
| vaesenclast $SCHED, $STATE7, $STATE7 |
| vaesenclast $SCHED, $STATE8, $STATE8 |
| ___ |
| }; |
| |
| # void ENC_MSG_x8(unsigned char* PT, |
| # unsigned char* CT, |
| # unsigned char* TAG, |
| # unsigned char* KS, |
| # size_t byte_len); |
| # parameter 1: %rdi #PT |
| # parameter 2: %rsi #CT |
| # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32] |
| # parameter 4: %rcx #KS |
| # parameter 5: %r8 #LEN MSG_length in bytes |
| $code.=<<___; |
| .globl aes128gcmsiv_enc_msg_x8 |
| .type aes128gcmsiv_enc_msg_x8,\@function,5 |
| .align 16 |
| aes128gcmsiv_enc_msg_x8: |
| .cfi_startproc |
| test $LEN, $LEN |
| jnz .L128_enc_msg_x8_start |
| ret |
| |
| .L128_enc_msg_x8_start: |
| pushq %r12 |
| .cfi_push %r12 |
| pushq %r13 |
| .cfi_push %r13 |
| pushq %rbp |
| .cfi_push %rbp |
| movq %rsp, %rbp |
| .cfi_def_cfa_register rbp |
| |
# Reserve 64-byte-aligned scratch space on the stack (for the eighth counter)
| subq \$128, %rsp |
| andq \$-64, %rsp |
| |
| shrq \$4, $LEN # LEN = num of blocks |
| movq $LEN, %r10 |
| shlq \$61, %r10 |
| shrq \$61, %r10 |
| |
| # make IV from TAG |
| vmovdqu ($TAG), $TMP1 |
| vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00] |
| |
# store counter 8 on the stack
| vpaddd seven(%rip), $TMP1, $CTR1 |
| vmovdqu $CTR1, (%rsp) # CTR8 = TAG[127...32][00..07] |
| vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01] |
| vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02] |
| vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03] |
| vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04] |
| vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05] |
| vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06] |
| vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00] |
| |
| shrq \$3, $LEN |
| je .L128_enc_msg_x8_check_remainder |
| |
| subq \$128, $CT |
| subq \$128, $PT |
| |
| .L128_enc_msg_x8_loop1: |
| addq \$128, $CT |
| addq \$128, $PT |
| |
| vmovdqa $CTR1, $STATE1 |
| vmovdqa $CTR2, $STATE2 |
| vmovdqa $CTR3, $STATE3 |
| vmovdqa $CTR4, $STATE4 |
| vmovdqa $CTR5, $STATE5 |
| vmovdqa $CTR6, $STATE6 |
| vmovdqa $CTR7, $STATE7 |
| # move from stack |
| vmovdqu (%rsp), $STATE8 |
| |
| vpxor ($KS), $STATE1, $STATE1 |
| vpxor ($KS), $STATE2, $STATE2 |
| vpxor ($KS), $STATE3, $STATE3 |
| vpxor ($KS), $STATE4, $STATE4 |
| vpxor ($KS), $STATE5, $STATE5 |
| vpxor ($KS), $STATE6, $STATE6 |
| vpxor ($KS), $STATE7, $STATE7 |
| vpxor ($KS), $STATE8, $STATE8 |
| |
| ${\$aes_round8->(1)} |
| vmovdqu (%rsp), $CTR7 # deal with CTR8 |
| vpaddd eight(%rip), $CTR7, $CTR7 |
| vmovdqu $CTR7, (%rsp) |
| ${\$aes_round8->(2)} |
| vpsubd one(%rip), $CTR7, $CTR7 |
| ${\$aes_round8->(3)} |
| vpaddd eight(%rip), $CTR1, $CTR1 |
| ${\$aes_round8->(4)} |
| vpaddd eight(%rip), $CTR2, $CTR2 |
| ${\$aes_round8->(5)} |
| vpaddd eight(%rip), $CTR3, $CTR3 |
| ${\$aes_round8->(6)} |
| vpaddd eight(%rip), $CTR4, $CTR4 |
| ${\$aes_round8->(7)} |
| vpaddd eight(%rip), $CTR5, $CTR5 |
| ${\$aes_round8->(8)} |
| vpaddd eight(%rip), $CTR6, $CTR6 |
| ${\$aes_round8->(9)} |
| ${\$aes_lastround8->(10)} |
| |
| # XOR with Plaintext |
| vpxor 0*16($PT), $STATE1, $STATE1 |
| vpxor 1*16($PT), $STATE2, $STATE2 |
| vpxor 2*16($PT), $STATE3, $STATE3 |
| vpxor 3*16($PT), $STATE4, $STATE4 |
| vpxor 4*16($PT), $STATE5, $STATE5 |
| vpxor 5*16($PT), $STATE6, $STATE6 |
| vpxor 6*16($PT), $STATE7, $STATE7 |
| vpxor 7*16($PT), $STATE8, $STATE8 |
| |
| dec $LEN |
| |
| vmovdqu $STATE1, 0*16($CT) |
| vmovdqu $STATE2, 1*16($CT) |
| vmovdqu $STATE3, 2*16($CT) |
| vmovdqu $STATE4, 3*16($CT) |
| vmovdqu $STATE5, 4*16($CT) |
| vmovdqu $STATE6, 5*16($CT) |
| vmovdqu $STATE7, 6*16($CT) |
| vmovdqu $STATE8, 7*16($CT) |
| |
| jne .L128_enc_msg_x8_loop1 |
| |
| addq \$128, $CT |
| addq \$128, $PT |
| |
| .L128_enc_msg_x8_check_remainder: |
| cmpq \$0, %r10 |
| je .L128_enc_msg_x8_out |
| |
| .L128_enc_msg_x8_loop2: |
# encrypt each remaining block separately
# CTR1 holds the next counter value (even if the main loop was skipped)
| vmovdqa $CTR1, $STATE1 |
| vpaddd one(%rip), $CTR1, $CTR1 # inc counter |
| |
| vpxor ($KS), $STATE1, $STATE1 |
| vaesenc 16($KS), $STATE1, $STATE1 |
| vaesenc 32($KS), $STATE1, $STATE1 |
| vaesenc 48($KS), $STATE1, $STATE1 |
| vaesenc 64($KS), $STATE1, $STATE1 |
| vaesenc 80($KS), $STATE1, $STATE1 |
| vaesenc 96($KS), $STATE1, $STATE1 |
| vaesenc 112($KS), $STATE1, $STATE1 |
| vaesenc 128($KS), $STATE1, $STATE1 |
| vaesenc 144($KS), $STATE1, $STATE1 |
| vaesenclast 160($KS), $STATE1, $STATE1 |
| |
| # XOR with Plaintext |
| vpxor ($PT), $STATE1, $STATE1 |
| |
| vmovdqu $STATE1, ($CT) |
| |
| addq \$16, $PT |
| addq \$16, $CT |
| |
| decq %r10 |
| jne .L128_enc_msg_x8_loop2 |
| |
| .L128_enc_msg_x8_out: |
| movq %rbp, %rsp |
| .cfi_def_cfa_register %rsp |
| popq %rbp |
| .cfi_pop %rbp |
| popq %r13 |
| .cfi_pop %r13 |
| popq %r12 |
| .cfi_pop %r12 |
| ret |
| .cfi_endproc |
| .size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8 |
| ___ |
| } |
| aes128gcmsiv_enc_msg_x8(); |
| |
| sub aesgcmsiv_dec { |
| my ($aes256) = @_; |
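# CTR-decrypts the ciphertext while folding the resulting plaintext into the
# running POLYVAL value (a descriptive note; register mapping taken from the
# declarations below):
# parameter 1: %rdi CT   - pointer to ciphertext; the 16 bytes at CT+LEN
#                          (the tag) seed the counter blocks
# parameter 2: %rsi PT   - pointer to plaintext output
# parameter 3: %rdx POL  - pointer to the 16-byte POLYVAL accumulator; the
#                          bytes following it are used as scratch space
# parameter 4: %rcx Htbl - pointer to the table of powers of H
# parameter 5: %r8  KS   - pointer to the AES key schedule
# parameter 6: %r9  LEN  - ciphertext length in bytes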
| |
| my $T = "%xmm0"; |
| my $TMP0 = "%xmm1"; |
| my $TMP1 = "%xmm2"; |
| my $TMP2 = "%xmm3"; |
| my $TMP3 = "%xmm4"; |
| my $TMP4 = "%xmm5"; |
| my $TMP5 = "%xmm6"; |
| my $CTR1 = "%xmm7"; |
| my $CTR2 = "%xmm8"; |
| my $CTR3 = "%xmm9"; |
| my $CTR4 = "%xmm10"; |
| my $CTR5 = "%xmm11"; |
| my $CTR6 = "%xmm12"; |
| my $CTR = "%xmm15"; |
| my $CT = "%rdi"; |
| my $PT = "%rsi"; |
| my $POL = "%rdx"; |
| my $Htbl = "%rcx"; |
| my $KS = "%r8"; |
| my $LEN = "%r9"; |
| my $secureBuffer = "%rax"; |
| my $HTABLE_ROUNDS = "%xmm13"; |
| |
| my $labelPrefix = "128"; |
| if ($aes256) { |
| $labelPrefix = "256"; |
| } |
| |
| my $aes_round_dec = sub { |
| my ($i) = @_; |
| return <<___; |
| vmovdqu ${\eval($i*16)}($KS), $TMP3 |
| vaesenc $TMP3, $CTR1, $CTR1 |
| vaesenc $TMP3, $CTR2, $CTR2 |
| vaesenc $TMP3, $CTR3, $CTR3 |
| vaesenc $TMP3, $CTR4, $CTR4 |
| vaesenc $TMP3, $CTR5, $CTR5 |
| vaesenc $TMP3, $CTR6, $CTR6 |
| ___ |
| }; |
| |
| my $aes_lastround_dec = sub { |
| my ($i) = @_; |
| return <<___; |
| vmovdqu ${\eval($i*16)}($KS), $TMP3 |
| vaesenclast $TMP3, $CTR1, $CTR1 |
| vaesenclast $TMP3, $CTR2, $CTR2 |
| vaesenclast $TMP3, $CTR3, $CTR3 |
| vaesenclast $TMP3, $CTR4, $CTR4 |
| vaesenclast $TMP3, $CTR5, $CTR5 |
| vaesenclast $TMP3, $CTR6, $CTR6 |
| ___ |
| }; |
| |
| my $schoolbook = sub { |
| my ($i) = @_; |
| return <<___; |
| vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5 |
| vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS |
| |
| vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP0, $TMP0 |
| vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP1, $TMP1 |
| vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP2, $TMP2 |
| vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP0, $TMP0 |
| ___ |
| }; |
| |
| if ($aes256) { |
| $code.=<<___; |
| .globl aes256gcmsiv_dec |
| .type aes256gcmsiv_dec,\@function,6 |
| .align 16 |
| aes256gcmsiv_dec: |
| ___ |
| } else { |
| $code.=<<___; |
| .globl aes128gcmsiv_dec |
| .type aes128gcmsiv_dec,\@function,6 |
| .align 16 |
| aes128gcmsiv_dec: |
| ___ |
| } |
| |
| $code.=<<___; |
| .cfi_startproc |
| test \$~15, $LEN |
| jnz .L${labelPrefix}_dec_start |
| ret |
| |
| .L${labelPrefix}_dec_start: |
| vzeroupper |
| vmovdqa ($POL), $T |
| movq $POL, $secureBuffer |
| |
| leaq 32($secureBuffer), $secureBuffer |
| leaq 32($Htbl), $Htbl |
| |
| # make CTRBLKs from given tag. |
| vmovdqu ($CT,$LEN), $CTR |
| vpor OR_MASK(%rip), $CTR, $CTR # CTR = [1]TAG[126...32][00..00] |
| andq \$~15, $LEN |
| |
# If fewer than 6 blocks remain, process them one at a time
| cmp \$96, $LEN |
| jb .L${labelPrefix}_dec_loop2 |
| |
| # Decrypt the first six blocks |
| sub \$96, $LEN |
| vmovdqa $CTR, $CTR1 |
| vpaddd one(%rip), $CTR1, $CTR2 |
| vpaddd two(%rip), $CTR1, $CTR3 |
| vpaddd one(%rip), $CTR3, $CTR4 |
| vpaddd two(%rip), $CTR3, $CTR5 |
| vpaddd one(%rip), $CTR5, $CTR6 |
| vpaddd two(%rip), $CTR5, $CTR |
| |
| vpxor ($KS), $CTR1, $CTR1 |
| vpxor ($KS), $CTR2, $CTR2 |
| vpxor ($KS), $CTR3, $CTR3 |
| vpxor ($KS), $CTR4, $CTR4 |
| vpxor ($KS), $CTR5, $CTR5 |
| vpxor ($KS), $CTR6, $CTR6 |
| |
| ${\$aes_round_dec->(1)} |
| ${\$aes_round_dec->(2)} |
| ${\$aes_round_dec->(3)} |
| ${\$aes_round_dec->(4)} |
| ${\$aes_round_dec->(5)} |
| ${\$aes_round_dec->(6)} |
| ${\$aes_round_dec->(7)} |
| ${\$aes_round_dec->(8)} |
| ${\$aes_round_dec->(9)} |
| ___ |
| |
| if ($aes256) { |
| $code.=<<___; |
| ${\$aes_round_dec->(10)} |
| ${\$aes_round_dec->(11)} |
| ${\$aes_round_dec->(12)} |
| ${\$aes_round_dec->(13)} |
| ${\$aes_lastround_dec->(14)} |
| ___ |
| } else { |
| $code.=<<___; |
| ${\$aes_lastround_dec->(10)} |
| ___ |
| } |
| |
| $code.=<<___; |
| # XOR with CT |
| vpxor 0*16($CT), $CTR1, $CTR1 |
| vpxor 1*16($CT), $CTR2, $CTR2 |
| vpxor 2*16($CT), $CTR3, $CTR3 |
| vpxor 3*16($CT), $CTR4, $CTR4 |
| vpxor 4*16($CT), $CTR5, $CTR5 |
| vpxor 5*16($CT), $CTR6, $CTR6 |
| |
| vmovdqu $CTR1, 0*16($PT) |
| vmovdqu $CTR2, 1*16($PT) |
| vmovdqu $CTR3, 2*16($PT) |
| vmovdqu $CTR4, 3*16($PT) |
| vmovdqu $CTR5, 4*16($PT) |
| vmovdqu $CTR6, 5*16($PT) |
| |
| addq \$96, $CT |
| addq \$96, $PT |
| jmp .L${labelPrefix}_dec_loop1 |
| |
| # Decrypt 6 blocks each time while hashing previous 6 blocks |
| .align 64 |
| .L${labelPrefix}_dec_loop1: |
| cmp \$96, $LEN |
| jb .L${labelPrefix}_dec_finish_96 |
| sub \$96, $LEN |
| |
| vmovdqa $CTR6, $TMP5 |
| vmovdqa $CTR5, 1*16-32($secureBuffer) |
| vmovdqa $CTR4, 2*16-32($secureBuffer) |
| vmovdqa $CTR3, 3*16-32($secureBuffer) |
| vmovdqa $CTR2, 4*16-32($secureBuffer) |
| vmovdqa $CTR1, 5*16-32($secureBuffer) |
| |
| vmovdqa $CTR, $CTR1 |
| vpaddd one(%rip), $CTR1, $CTR2 |
| vpaddd two(%rip), $CTR1, $CTR3 |
| vpaddd one(%rip), $CTR3, $CTR4 |
| vpaddd two(%rip), $CTR3, $CTR5 |
| vpaddd one(%rip), $CTR5, $CTR6 |
| vpaddd two(%rip), $CTR5, $CTR |
| |
| vmovdqa ($KS), $TMP3 |
| vpxor $TMP3, $CTR1, $CTR1 |
| vpxor $TMP3, $CTR2, $CTR2 |
| vpxor $TMP3, $CTR3, $CTR3 |
| vpxor $TMP3, $CTR4, $CTR4 |
| vpxor $TMP3, $CTR5, $CTR5 |
| vpxor $TMP3, $CTR6, $CTR6 |
| |
| vmovdqu 0*16-32($Htbl), $TMP3 |
| vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1 |
| vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2 |
| vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0 |
| vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP0, $TMP0 |
| |
| ${\$aes_round_dec->(1)} |
| ${\$schoolbook->(1)} |
| |
| ${\$aes_round_dec->(2)} |
| ${\$schoolbook->(2)} |
| |
| ${\$aes_round_dec->(3)} |
| ${\$schoolbook->(3)} |
| |
| ${\$aes_round_dec->(4)} |
| ${\$schoolbook->(4)} |
| |
| ${\$aes_round_dec->(5)} |
| ${\$aes_round_dec->(6)} |
| ${\$aes_round_dec->(7)} |
| |
| vmovdqa 5*16-32($secureBuffer), $TMP5 |
| vpxor $T, $TMP5, $TMP5 |
| vmovdqu 5*16-32($Htbl), $TMP4 |
| |
| vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP0, $TMP0 |
| vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP1, $TMP1 |
| vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP2, $TMP2 |
| vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP0, $TMP0 |
| |
| ${\$aes_round_dec->(8)} |
| |
| vpsrldq \$8, $TMP0, $TMP3 |
| vpxor $TMP3, $TMP1, $TMP4 |
| vpslldq \$8, $TMP0, $TMP3 |
| vpxor $TMP3, $TMP2, $T |
| |
| vmovdqa poly(%rip), $TMP2 |
| |
| ${\$aes_round_dec->(9)} |
| ___ |
| |
| if ($aes256) { |
| $code.=<<___; |
| ${\$aes_round_dec->(10)} |
| ${\$aes_round_dec->(11)} |
| ${\$aes_round_dec->(12)} |
| ${\$aes_round_dec->(13)} |
| vmovdqu 14*16($KS), $TMP5 |
| ___ |
| } else { |
| $code.=<<___; |
| vmovdqu 10*16($KS), $TMP5 |
| ___ |
| } |
| |
| $code.=<<___; |
| vpalignr \$8, $T, $T, $TMP1 |
| vpclmulqdq \$0x10, $TMP2, $T, $T |
| vpxor $T, $TMP1, $T |
| |
| vpxor 0*16($CT), $TMP5, $TMP3 |
| vaesenclast $TMP3, $CTR1, $CTR1 |
| vpxor 1*16($CT), $TMP5, $TMP3 |
| vaesenclast $TMP3, $CTR2, $CTR2 |
| vpxor 2*16($CT), $TMP5, $TMP3 |
| vaesenclast $TMP3, $CTR3, $CTR3 |
| vpxor 3*16($CT), $TMP5, $TMP3 |
| vaesenclast $TMP3, $CTR4, $CTR4 |
| vpxor 4*16($CT), $TMP5, $TMP3 |
| vaesenclast $TMP3, $CTR5, $CTR5 |
| vpxor 5*16($CT), $TMP5, $TMP3 |
| vaesenclast $TMP3, $CTR6, $CTR6 |
| |
| vpalignr \$8, $T, $T, $TMP1 |
| vpclmulqdq \$0x10, $TMP2, $T, $T |
| vpxor $T, $TMP1, $T |
| |
| vmovdqu $CTR1, 0*16($PT) |
| vmovdqu $CTR2, 1*16($PT) |
| vmovdqu $CTR3, 2*16($PT) |
| vmovdqu $CTR4, 3*16($PT) |
| vmovdqu $CTR5, 4*16($PT) |
| vmovdqu $CTR6, 5*16($PT) |
| |
| vpxor $TMP4, $T, $T |
| |
| lea 96($CT), $CT |
| lea 96($PT), $PT |
| jmp .L${labelPrefix}_dec_loop1 |
| |
| .L${labelPrefix}_dec_finish_96: |
| vmovdqa $CTR6, $TMP5 |
| vmovdqa $CTR5, 1*16-32($secureBuffer) |
| vmovdqa $CTR4, 2*16-32($secureBuffer) |
| vmovdqa $CTR3, 3*16-32($secureBuffer) |
| vmovdqa $CTR2, 4*16-32($secureBuffer) |
| vmovdqa $CTR1, 5*16-32($secureBuffer) |
| |
| vmovdqu 0*16-32($Htbl), $TMP3 |
| vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0 |
| vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1 |
| vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2 |
| vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP0, $TMP0 |
| |
| ${\$schoolbook->(1)} |
| ${\$schoolbook->(2)} |
| ${\$schoolbook->(3)} |
| ${\$schoolbook->(4)} |
| |
| vmovdqu 5*16-32($secureBuffer), $TMP5 |
| vpxor $T, $TMP5, $TMP5 |
| vmovdqu 5*16-32($Htbl), $TMP4 |
| vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP1, $TMP1 |
| vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP2, $TMP2 |
| vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP0, $TMP0 |
| vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3 |
| vpxor $TMP3, $TMP0, $TMP0 |
| |
| vpsrldq \$8, $TMP0, $TMP3 |
| vpxor $TMP3, $TMP1, $TMP4 |
| vpslldq \$8, $TMP0, $TMP3 |
| vpxor $TMP3, $TMP2, $T |
| |
| vmovdqa poly(%rip), $TMP2 |
| |
| vpalignr \$8, $T, $T, $TMP1 |
| vpclmulqdq \$0x10, $TMP2, $T, $T |
| vpxor $T, $TMP1, $T |
| |
| vpalignr \$8, $T, $T, $TMP1 |
| vpclmulqdq \$0x10, $TMP2, $T, $T |
| vpxor $T, $TMP1, $T |
| |
| vpxor $TMP4, $T, $T |
| |
| .L${labelPrefix}_dec_loop2: |
# Decrypt any remaining whole blocks one at a time, folding each
# decrypted block into the POLYVAL accumulator.

# exit if no whole blocks remain
| cmp \$16, $LEN |
| jb .L${labelPrefix}_dec_out |
| sub \$16, $LEN |
| |
| vmovdqa $CTR, $TMP1 |
| vpaddd one(%rip), $CTR, $CTR |
| |
| vpxor 0*16($KS), $TMP1, $TMP1 |
| vaesenc 1*16($KS), $TMP1, $TMP1 |
| vaesenc 2*16($KS), $TMP1, $TMP1 |
| vaesenc 3*16($KS), $TMP1, $TMP1 |
| vaesenc 4*16($KS), $TMP1, $TMP1 |
| vaesenc 5*16($KS), $TMP1, $TMP1 |
| vaesenc 6*16($KS), $TMP1, $TMP1 |
| vaesenc 7*16($KS), $TMP1, $TMP1 |
| vaesenc 8*16($KS), $TMP1, $TMP1 |
| vaesenc 9*16($KS), $TMP1, $TMP1 |
| ___ |
| if ($aes256) { |
| $code.=<<___; |
| vaesenc 10*16($KS), $TMP1, $TMP1 |
| vaesenc 11*16($KS), $TMP1, $TMP1 |
| vaesenc 12*16($KS), $TMP1, $TMP1 |
| vaesenc 13*16($KS), $TMP1, $TMP1 |
| vaesenclast 14*16($KS), $TMP1, $TMP1 |
| ___ |
| } else { |
| $code.=<<___; |
| vaesenclast 10*16($KS), $TMP1, $TMP1 |
| ___ |
| } |
| |
| $code.=<<___; |
| vpxor ($CT), $TMP1, $TMP1 |
| vmovdqu $TMP1, ($PT) |
| addq \$16, $CT |
| addq \$16, $PT |
| |
| vpxor $TMP1, $T, $T |
| vmovdqa -32($Htbl), $TMP0 |
| call GFMUL |
| |
| jmp .L${labelPrefix}_dec_loop2 |
| |
| .L${labelPrefix}_dec_out: |
| vmovdqu $T, ($POL) |
| ret |
| .cfi_endproc |
| ___ |
| |
| if ($aes256) { |
| $code.=<<___; |
| .size aes256gcmsiv_dec, .-aes256gcmsiv_dec |
| ___ |
| } else { |
| $code.=<<___; |
| .size aes128gcmsiv_dec, .-aes128gcmsiv_dec |
| ___ |
| } |
| } |
| |
| aesgcmsiv_dec(0); # emit 128-bit version |
| |
| sub aes128gcmsiv_ecb_enc_block { |
| my $STATE_1 = "%xmm1"; |
| my $KSp = "%rdx"; |
| |
# parameter 1: PT %rdi (pointer to a 128-bit block)
# parameter 2: CT %rsi (pointer to a 128-bit block)
# parameter 3: ks %rdx (pointer to the key schedule)
| $code.=<<___; |
| .globl aes128gcmsiv_ecb_enc_block |
| .type aes128gcmsiv_ecb_enc_block,\@function,3 |
| .align 16 |
| aes128gcmsiv_ecb_enc_block: |
| .cfi_startproc |
| vmovdqa (%rdi), $STATE_1 |
| |
| vpxor ($KSp), $STATE_1, $STATE_1 |
| vaesenc 1*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 2*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 3*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 4*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 5*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 6*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 7*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 8*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 9*16($KSp), $STATE_1, $STATE_1 |
| vaesenclast 10*16($KSp), $STATE_1, $STATE_1 # STATE_1 == IV |
| |
| vmovdqa $STATE_1, (%rsi) |
| |
| ret |
| .cfi_endproc |
| .size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block |
| ___ |
| } |
| aes128gcmsiv_ecb_enc_block(); |
| |
| sub aes256gcmsiv_aes_ks_enc_x1 { |
| my $KS = "%rdx"; |
| my $KEYp = "%rcx"; |
| my $CON_MASK = "%xmm0"; |
| my $MASK_256 = "%xmm15"; |
| my $KEY_1 = "%xmm1"; |
| my $KEY_2 = "%xmm3"; |
| my $BLOCK1 = "%xmm8"; |
| my $AUX_REG = "%xmm14"; |
| my $PT = "%rdi"; |
| my $CT = "%rsi"; |
| |
| my $round_double = sub { |
| my ($i, $j) = @_; |
| return <<___; |
| vpshufb %xmm15, %xmm3, %xmm2 |
| vaesenclast %xmm0, %xmm2, %xmm2 |
| vpslld \$1, %xmm0, %xmm0 |
| vpslldq \$4, %xmm1, %xmm4 |
| vpxor %xmm4, %xmm1, %xmm1 |
| vpslldq \$4, %xmm4, %xmm4 |
| vpxor %xmm4, %xmm1, %xmm1 |
| vpslldq \$4, %xmm4, %xmm4 |
| vpxor %xmm4, %xmm1, %xmm1 |
| vpxor %xmm2, %xmm1, %xmm1 |
| vaesenc %xmm1, $BLOCK1, $BLOCK1 |
| vmovdqu %xmm1, ${\eval(16*$i)}($KS) |
| |
| vpshufd \$0xff, %xmm1, %xmm2 |
| vaesenclast %xmm14, %xmm2, %xmm2 |
| vpslldq \$4, %xmm3, %xmm4 |
| vpxor %xmm4, %xmm3, %xmm3 |
| vpslldq \$4, %xmm4, %xmm4 |
| vpxor %xmm4, %xmm3, %xmm3 |
| vpslldq \$4, %xmm4, %xmm4 |
| vpxor %xmm4, %xmm3, %xmm3 |
| vpxor %xmm2, %xmm3, %xmm3 |
| vaesenc %xmm3, $BLOCK1, $BLOCK1 |
| vmovdqu %xmm3, ${\eval(16*$j)}($KS) |
| ___ |
| }; |
| |
| my $round_last = sub { |
| my ($i) = @_; |
| return <<___; |
| vpshufb %xmm15, %xmm3, %xmm2 |
| vaesenclast %xmm0, %xmm2, %xmm2 |
| vpslldq \$4, %xmm1, %xmm4 |
| vpxor %xmm4, %xmm1, %xmm1 |
| vpslldq \$4, %xmm4, %xmm4 |
| vpxor %xmm4, %xmm1, %xmm1 |
| vpslldq \$4, %xmm4, %xmm4 |
| vpxor %xmm4, %xmm1, %xmm1 |
| vpxor %xmm2, %xmm1, %xmm1 |
| vaesenclast %xmm1, $BLOCK1, $BLOCK1 |
| vmovdqu %xmm1, ${\eval(16*$i)}($KS) |
| ___ |
| }; |
| |
| # parameter 1: %rdi Pointer to PT1 |
| # parameter 2: %rsi Pointer to CT1 |
| # parameter 3: %rdx Pointer to KS |
| # parameter 4: %rcx Pointer to initial key |
| $code.=<<___; |
| .globl aes256gcmsiv_aes_ks_enc_x1 |
| .type aes256gcmsiv_aes_ks_enc_x1,\@function,4 |
| .align 16 |
| aes256gcmsiv_aes_ks_enc_x1: |
| .cfi_startproc |
| vmovdqa con1(%rip), $CON_MASK # CON_MASK = 1,1,1,1 |
| vmovdqa mask(%rip), $MASK_256 # MASK_256 |
| vmovdqa ($PT), $BLOCK1 |
vmovdqa ($KEYp), $KEY_1 # KEY_1 || KEY_2 = 32-byte user key
| vmovdqa 16($KEYp), $KEY_2 |
| vpxor $KEY_1, $BLOCK1, $BLOCK1 |
| vaesenc $KEY_2, $BLOCK1, $BLOCK1 |
| vmovdqu $KEY_1, ($KS) # First round key |
| vmovdqu $KEY_2, 16($KS) |
| vpxor $AUX_REG, $AUX_REG, $AUX_REG |
| |
| ${\$round_double->(2, 3)} |
| ${\$round_double->(4, 5)} |
| ${\$round_double->(6, 7)} |
| ${\$round_double->(8, 9)} |
| ${\$round_double->(10, 11)} |
| ${\$round_double->(12, 13)} |
| ${\$round_last->(14)} |
| vmovdqa $BLOCK1, ($CT) |
| ret |
| .cfi_endproc |
| .size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1 |
| ___ |
| } |
| aes256gcmsiv_aes_ks_enc_x1(); |
| |
| sub aes256gcmsiv_ecb_enc_block { |
| my $STATE_1 = "%xmm1"; |
| my $PT = "%rdi"; |
| my $CT = "%rsi"; |
| my $KSp = "%rdx"; |
| |
# parameter 1: PT %rdi (pointer to a 128-bit block)
# parameter 2: CT %rsi (pointer to a 128-bit block)
# parameter 3: ks %rdx (pointer to the key schedule)
| $code.=<<___; |
| .globl aes256gcmsiv_ecb_enc_block |
| .type aes256gcmsiv_ecb_enc_block,\@function,3 |
| .align 16 |
| aes256gcmsiv_ecb_enc_block: |
| .cfi_startproc |
| vmovdqa (%rdi), $STATE_1 |
| vpxor ($KSp), $STATE_1, $STATE_1 |
| vaesenc 1*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 2*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 3*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 4*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 5*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 6*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 7*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 8*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 9*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 10*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 11*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 12*16($KSp), $STATE_1, $STATE_1 |
| vaesenc 13*16($KSp), $STATE_1, $STATE_1 |
| vaesenclast 14*16($KSp), $STATE_1, $STATE_1 # $STATE_1 == IV |
| vmovdqa $STATE_1, (%rsi) |
| ret |
| .cfi_endproc |
| .size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block |
| ___ |
| } |
| aes256gcmsiv_ecb_enc_block(); |
| |
| sub aes256gcmsiv_enc_msg_x4 { |
| my $CTR1 = "%xmm0"; |
| my $CTR2 = "%xmm1"; |
| my $CTR3 = "%xmm2"; |
| my $CTR4 = "%xmm3"; |
| my $ADDER = "%xmm4"; |
| |
| my $STATE1 = "%xmm5"; |
| my $STATE2 = "%xmm6"; |
| my $STATE3 = "%xmm7"; |
| my $STATE4 = "%xmm8"; |
| |
| my $TMP = "%xmm12"; |
| my $TMP2 = "%xmm13"; |
| my $TMP3 = "%xmm14"; |
| my $IV = "%xmm15"; |
| |
| my $PT = "%rdi"; |
| my $CT = "%rsi"; |
| my $TAG = "%rdx"; |
| my $KS = "%rcx"; |
| my $LEN = "%r8"; |
| |
| my $aes_round = sub { |
| my ($i) = @_; |
| return <<___; |
| vmovdqu ${\eval($i*16)}($KS), $TMP |
| vaesenc $TMP, $STATE1, $STATE1 |
| vaesenc $TMP, $STATE2, $STATE2 |
| vaesenc $TMP, $STATE3, $STATE3 |
| vaesenc $TMP, $STATE4, $STATE4 |
| ___ |
| }; |
| |
| my $aes_lastround = sub { |
| my ($i) = @_; |
| return <<___; |
| vmovdqu ${\eval($i*16)}($KS), $TMP |
| vaesenclast $TMP, $STATE1, $STATE1 |
| vaesenclast $TMP, $STATE2, $STATE2 |
| vaesenclast $TMP, $STATE3, $STATE3 |
| vaesenclast $TMP, $STATE4, $STATE4 |
| ___ |
| }; |
| |
| # void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT, |
| # unsigned char* TAG, unsigned char* KS, |
| # size_t byte_len); |
| # parameter 1: %rdi #PT |
| # parameter 2: %rsi #CT |
| # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32] |
| # parameter 4: %rcx #KS |
| # parameter 5: %r8 #LEN MSG_length in bytes |
| $code.=<<___; |
| .globl aes256gcmsiv_enc_msg_x4 |
| .type aes256gcmsiv_enc_msg_x4,\@function,5 |
| .align 16 |
| aes256gcmsiv_enc_msg_x4: |
| .cfi_startproc |
| test $LEN, $LEN |
| jnz .L256_enc_msg_x4_start |
| ret |
| |
| .L256_enc_msg_x4_start: |
| movq $LEN, %r10 |
| shrq \$4, $LEN # LEN = num of blocks |
| shlq \$60, %r10 |
| jz .L256_enc_msg_x4_start2 |
| addq \$1, $LEN |
| |
| .L256_enc_msg_x4_start2: |
| movq $LEN, %r10 |
| shlq \$62, %r10 |
| shrq \$62, %r10 |
| |
| # make IV from TAG |
| vmovdqa ($TAG), $IV |
| vpor OR_MASK(%rip), $IV, $IV # IV = [1]TAG[126...32][00..00] |
| |
| vmovdqa four(%rip), $ADDER # Register to increment counters |
| vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00] |
| vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01] |
| vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02] |
| vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03] |
| |
| shrq \$2, $LEN |
| je .L256_enc_msg_x4_check_remainder |
| |
| subq \$64, $CT |
| subq \$64, $PT |
| |
| .L256_enc_msg_x4_loop1: |
| addq \$64, $CT |
| addq \$64, $PT |
| |
| vmovdqa $CTR1, $STATE1 |
| vmovdqa $CTR2, $STATE2 |
| vmovdqa $CTR3, $STATE3 |
| vmovdqa $CTR4, $STATE4 |
| |
| vpxor ($KS), $STATE1, $STATE1 |
| vpxor ($KS), $STATE2, $STATE2 |
| vpxor ($KS), $STATE3, $STATE3 |
| vpxor ($KS), $STATE4, $STATE4 |
| |
| ${\$aes_round->(1)} |
| vpaddd $ADDER, $CTR1, $CTR1 |
| ${\$aes_round->(2)} |
| vpaddd $ADDER, $CTR2, $CTR2 |
| ${\$aes_round->(3)} |
| vpaddd $ADDER, $CTR3, $CTR3 |
| ${\$aes_round->(4)} |
| vpaddd $ADDER, $CTR4, $CTR4 |
| |
| ${\$aes_round->(5)} |
| ${\$aes_round->(6)} |
| ${\$aes_round->(7)} |
| ${\$aes_round->(8)} |
| ${\$aes_round->(9)} |
| ${\$aes_round->(10)} |
| ${\$aes_round->(11)} |
| ${\$aes_round->(12)} |
| ${\$aes_round->(13)} |
| ${\$aes_lastround->(14)} |
| |
| # XOR with Plaintext |
| vpxor 0*16($PT), $STATE1, $STATE1 |
| vpxor 1*16($PT), $STATE2, $STATE2 |
| vpxor 2*16($PT), $STATE3, $STATE3 |
| vpxor 3*16($PT), $STATE4, $STATE4 |
| |
| subq \$1, $LEN |
| |
| vmovdqu $STATE1, 0*16($CT) |
| vmovdqu $STATE2, 1*16($CT) |
| vmovdqu $STATE3, 2*16($CT) |
| vmovdqu $STATE4, 3*16($CT) |
| |
| jne .L256_enc_msg_x4_loop1 |
| |
| addq \$64, $CT |
| addq \$64, $PT |
| |
| .L256_enc_msg_x4_check_remainder: |
| cmpq \$0, %r10 |
| je .L256_enc_msg_x4_out |
| |
| .L256_enc_msg_x4_loop2: |
# encrypt each remaining block separately
# CTR1 holds the next counter value (even if the main loop was skipped)
| |
| vmovdqa $CTR1, $STATE1 |
| vpaddd one(%rip), $CTR1, $CTR1 # inc counter |
| vpxor ($KS), $STATE1, $STATE1 |
| vaesenc 16($KS), $STATE1, $STATE1 |
| vaesenc 32($KS), $STATE1, $STATE1 |
| vaesenc 48($KS), $STATE1, $STATE1 |
| vaesenc 64($KS), $STATE1, $STATE1 |
| vaesenc 80($KS), $STATE1, $STATE1 |
| vaesenc 96($KS), $STATE1, $STATE1 |
| vaesenc 112($KS), $STATE1, $STATE1 |
| vaesenc 128($KS), $STATE1, $STATE1 |
| vaesenc 144($KS), $STATE1, $STATE1 |
| vaesenc 160($KS), $STATE1, $STATE1 |
| vaesenc 176($KS), $STATE1, $STATE1 |
| vaesenc 192($KS), $STATE1, $STATE1 |
| vaesenc 208($KS), $STATE1, $STATE1 |
| vaesenclast 224($KS), $STATE1, $STATE1 |
| |
| # XOR with Plaintext |
| vpxor ($PT), $STATE1, $STATE1 |
| |
| vmovdqu $STATE1, ($CT) |
| |
| addq \$16, $PT |
| addq \$16, $CT |
| |
| subq \$1, %r10 |
| jne .L256_enc_msg_x4_loop2 |
| |
| .L256_enc_msg_x4_out: |
| ret |
| .cfi_endproc |
| .size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4 |
| ___ |
| } |
| aes256gcmsiv_enc_msg_x4(); |
| |
sub aes256gcmsiv_enc_msg_x8 {
| my $STATE1 = "%xmm1"; |
| my $STATE2 = "%xmm2"; |
| my $STATE3 = "%xmm3"; |
| my $STATE4 = "%xmm4"; |
| my $STATE5 = "%xmm5"; |
| my $STATE6 = "%xmm6"; |
| my $STATE7 = "%xmm7"; |
| my $STATE8 = "%xmm8"; |
| my $CTR1 = "%xmm0"; |
| my $CTR2 = "%xmm9"; |
| my $CTR3 = "%xmm10"; |
| my $CTR4 = "%xmm11"; |
| my $CTR5 = "%xmm12"; |
| my $CTR6 = "%xmm13"; |
| my $CTR7 = "%xmm14"; |
| my $TMP1 = "%xmm1"; |
| my $TMP2 = "%xmm2"; |
| my $KS = "%rcx"; |
| my $LEN = "%r8"; |
| my $PT = "%rdi"; |
| my $CT = "%rsi"; |
| my $TAG = "%rdx"; |
| my $SCHED = "%xmm15"; |
| |
| my $aes_round8 = sub { |
| my ($i) = @_; |
| return <<___; |
| vmovdqu ${\eval($i*16)}($KS), $SCHED |
| vaesenc $SCHED, $STATE1, $STATE1 |
| vaesenc $SCHED, $STATE2, $STATE2 |
| vaesenc $SCHED, $STATE3, $STATE3 |
| vaesenc $SCHED, $STATE4, $STATE4 |
| vaesenc $SCHED, $STATE5, $STATE5 |
| vaesenc $SCHED, $STATE6, $STATE6 |
| vaesenc $SCHED, $STATE7, $STATE7 |
| vaesenc $SCHED, $STATE8, $STATE8 |
| ___ |
| }; |
| |
| my $aes_lastround8 = sub { |
| my ($i) = @_; |
| return <<___; |
| vmovdqu ${\eval($i*16)}($KS), $SCHED |
| vaesenclast $SCHED, $STATE1, $STATE1 |
| vaesenclast $SCHED, $STATE2, $STATE2 |
| vaesenclast $SCHED, $STATE3, $STATE3 |
| vaesenclast $SCHED, $STATE4, $STATE4 |
| vaesenclast $SCHED, $STATE5, $STATE5 |
| vaesenclast $SCHED, $STATE6, $STATE6 |
| vaesenclast $SCHED, $STATE7, $STATE7 |
| vaesenclast $SCHED, $STATE8, $STATE8 |
| ___ |
| }; |
| |
| # void ENC_MSG_x8(unsigned char* PT, |
| # unsigned char* CT, |
| # unsigned char* TAG, |
| # unsigned char* KS, |
| # size_t byte_len); |
| # parameter 1: %rdi #PT |
| # parameter 2: %rsi #CT |
| # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32] |
| # parameter 4: %rcx #KS |
| # parameter 5: %r8 #LEN MSG_length in bytes |
| $code.=<<___; |
| .globl aes256gcmsiv_enc_msg_x8 |
| .type aes256gcmsiv_enc_msg_x8,\@function,5 |
| .align 16 |
| aes256gcmsiv_enc_msg_x8: |
| .cfi_startproc |
| test $LEN, $LEN |
| jnz .L256_enc_msg_x8_start |
| ret |
| |
| .L256_enc_msg_x8_start: |
# Set up a 64-byte-aligned scratch slot below %rsp (for the eighth counter)
| movq %rsp, %r11 |
| subq \$16, %r11 |
| andq \$-64, %r11 |
| |
| movq $LEN, %r10 |
| shrq \$4, $LEN # LEN = num of blocks |
| shlq \$60, %r10 |
| jz .L256_enc_msg_x8_start2 |
| addq \$1, $LEN |
| |
| .L256_enc_msg_x8_start2: |
| movq $LEN, %r10 |
| shlq \$61, %r10 |
| shrq \$61, %r10 |
| |
| # Make IV from TAG |
| vmovdqa ($TAG), $TMP1 |
| vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00] |
| |
| # store counter8 on the stack |
| vpaddd seven(%rip), $TMP1, $CTR1 |
| vmovdqa $CTR1, (%r11) # CTR8 = TAG[127...32][00..07] |
| vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01] |
| vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02] |
| vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03] |
| vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04] |
| vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05] |
| vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06] |
| vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00] |
| |
| shrq \$3, $LEN |
| jz .L256_enc_msg_x8_check_remainder |
| |
| subq \$128, $CT |
| subq \$128, $PT |
| |
| .L256_enc_msg_x8_loop1: |
| addq \$128, $CT |
| addq \$128, $PT |
| |
| vmovdqa $CTR1, $STATE1 |
| vmovdqa $CTR2, $STATE2 |
| vmovdqa $CTR3, $STATE3 |
| vmovdqa $CTR4, $STATE4 |
| vmovdqa $CTR5, $STATE5 |
| vmovdqa $CTR6, $STATE6 |
| vmovdqa $CTR7, $STATE7 |
| # move from stack |
| vmovdqa (%r11), $STATE8 |
| |
| vpxor ($KS), $STATE1, $STATE1 |
| vpxor ($KS), $STATE2, $STATE2 |
| vpxor ($KS), $STATE3, $STATE3 |
| vpxor ($KS), $STATE4, $STATE4 |
| vpxor ($KS), $STATE5, $STATE5 |
| vpxor ($KS), $STATE6, $STATE6 |
| vpxor ($KS), $STATE7, $STATE7 |
| vpxor ($KS), $STATE8, $STATE8 |
| |
| ${\$aes_round8->(1)} |
| vmovdqa (%r11), $CTR7 # deal with CTR8 |
| vpaddd eight(%rip), $CTR7, $CTR7 |
| vmovdqa $CTR7, (%r11) |
| ${\$aes_round8->(2)} |
| vpsubd one(%rip), $CTR7, $CTR7 |
| ${\$aes_round8->(3)} |
| vpaddd eight(%rip), $CTR1, $CTR1 |
| ${\$aes_round8->(4)} |
| vpaddd eight(%rip), $CTR2, $CTR2 |
| ${\$aes_round8->(5)} |
| vpaddd eight(%rip), $CTR3, $CTR3 |
| ${\$aes_round8->(6)} |
| vpaddd eight(%rip), $CTR4, $CTR4 |
| ${\$aes_round8->(7)} |
| vpaddd eight(%rip), $CTR5, $CTR5 |
| ${\$aes_round8->(8)} |
| vpaddd eight(%rip), $CTR6, $CTR6 |
| ${\$aes_round8->(9)} |
| ${\$aes_round8->(10)} |
| ${\$aes_round8->(11)} |
| ${\$aes_round8->(12)} |
| ${\$aes_round8->(13)} |
| ${\$aes_lastround8->(14)} |
| |
| # XOR with Plaintext |
| vpxor 0*16($PT), $STATE1, $STATE1 |
| vpxor 1*16($PT), $STATE2, $STATE2 |
| vpxor 2*16($PT), $STATE3, $STATE3 |
| vpxor 3*16($PT), $STATE4, $STATE4 |
| vpxor 4*16($PT), $STATE5, $STATE5 |
| vpxor 5*16($PT), $STATE6, $STATE6 |
| vpxor 6*16($PT), $STATE7, $STATE7 |
| vpxor 7*16($PT), $STATE8, $STATE8 |
| |
| subq \$1, $LEN |
| |
| vmovdqu $STATE1, 0*16($CT) |
| vmovdqu $STATE2, 1*16($CT) |
| vmovdqu $STATE3, 2*16($CT) |
| vmovdqu $STATE4, 3*16($CT) |
| vmovdqu $STATE5, 4*16($CT) |
| vmovdqu $STATE6, 5*16($CT) |
| vmovdqu $STATE7, 6*16($CT) |
| vmovdqu $STATE8, 7*16($CT) |
| |
| jne .L256_enc_msg_x8_loop1 |
| |
| addq \$128, $CT |
| addq \$128, $PT |
| |
| .L256_enc_msg_x8_check_remainder: |
| cmpq \$0, %r10 |
| je .L256_enc_msg_x8_out |
| |
| .L256_enc_msg_x8_loop2: |
# encrypt each remaining block separately
# CTR1 holds the next counter value (even if the main loop was skipped)
| vmovdqa $CTR1, $STATE1 |
| vpaddd one(%rip), $CTR1, $CTR1 |
| |
| vpxor ($KS), $STATE1, $STATE1 |
| vaesenc 16($KS), $STATE1, $STATE1 |
| vaesenc 32($KS), $STATE1, $STATE1 |
| vaesenc 48($KS), $STATE1, $STATE1 |
| vaesenc 64($KS), $STATE1, $STATE1 |
| vaesenc 80($KS), $STATE1, $STATE1 |
| vaesenc 96($KS), $STATE1, $STATE1 |
| vaesenc 112($KS), $STATE1, $STATE1 |
| vaesenc 128($KS), $STATE1, $STATE1 |
| vaesenc 144($KS), $STATE1, $STATE1 |
| vaesenc 160($KS), $STATE1, $STATE1 |
| vaesenc 176($KS), $STATE1, $STATE1 |
| vaesenc 192($KS), $STATE1, $STATE1 |
| vaesenc 208($KS), $STATE1, $STATE1 |
| vaesenclast 224($KS), $STATE1, $STATE1 |
| |
| # XOR with Plaintext |
| vpxor ($PT), $STATE1, $STATE1 |
| |
| vmovdqu $STATE1, ($CT) |
| |
| addq \$16, $PT |
| addq \$16, $CT |
| subq \$1, %r10 |
| jnz .L256_enc_msg_x8_loop2 |
| |
| .L256_enc_msg_x8_out: |
ret
.cfi_endproc
| .size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8 |
| ___ |
| } |
| aes256gcmsiv_enc_msg_x8(); |
| aesgcmsiv_dec(1); |
| |
| sub aes256gcmsiv_kdf { |
| my $ONE = "%xmm8"; |
| my $BLOCK1 = "%xmm4"; |
| my $BLOCK2 = "%xmm6"; |
| my $BLOCK3 = "%xmm7"; |
| my $BLOCK4 = "%xmm11"; |
| my $BLOCK5 = "%xmm12"; |
| my $BLOCK6 = "%xmm13"; |
| |
| my $enc_roundx6 = sub { |
| my ($i, $j) = @_; |
| return <<___; |
| vmovdqa ${\eval($i*16)}(%rdx), $j |
| vaesenc $j, $BLOCK1, $BLOCK1 |
| vaesenc $j, $BLOCK2, $BLOCK2 |
| vaesenc $j, $BLOCK3, $BLOCK3 |
| vaesenc $j, $BLOCK4, $BLOCK4 |
| vaesenc $j, $BLOCK5, $BLOCK5 |
| vaesenc $j, $BLOCK6, $BLOCK6 |
| ___ |
| }; |
| |
| my $enc_roundlastx6 = sub { |
| my ($i, $j) = @_; |
| return <<___; |
| vmovdqa ${\eval($i*16)}(%rdx), $j |
| vaesenclast $j, $BLOCK1, $BLOCK1 |
| vaesenclast $j, $BLOCK2, $BLOCK2 |
| vaesenclast $j, $BLOCK3, $BLOCK3 |
| vaesenclast $j, $BLOCK4, $BLOCK4 |
| vaesenclast $j, $BLOCK5, $BLOCK5 |
| vaesenclast $j, $BLOCK6, $BLOCK6 |
| ___ |
| }; |
| |
| # void aes256gcmsiv_kdf(const uint8_t nonce[16], |
| # uint8_t *out_key_material, |
| # const uint8_t *key_schedule); |
| $code.=<<___; |
| .globl aes256gcmsiv_kdf |
| .type aes256gcmsiv_kdf,\@function,3 |
| .align 16 |
| aes256gcmsiv_kdf: |
| .cfi_startproc |
| # parameter 1: %rdi Pointer to NONCE |
| # parameter 2: %rsi Pointer to CT |
# parameter 3: %rdx Pointer to keys
| |
| vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key |
| vmovdqa 0*16(%rdi), $BLOCK1 |
| vmovdqa and_mask(%rip), $BLOCK4 |
| vmovdqa one(%rip), $ONE |
| vpshufd \$0x90, $BLOCK1, $BLOCK1 |
| vpand $BLOCK4, $BLOCK1, $BLOCK1 |
| vpaddd $ONE, $BLOCK1, $BLOCK2 |
| vpaddd $ONE, $BLOCK2, $BLOCK3 |
| vpaddd $ONE, $BLOCK3, $BLOCK4 |
| vpaddd $ONE, $BLOCK4, $BLOCK5 |
| vpaddd $ONE, $BLOCK5, $BLOCK6 |
| |
| vpxor %xmm1, $BLOCK1, $BLOCK1 |
| vpxor %xmm1, $BLOCK2, $BLOCK2 |
| vpxor %xmm1, $BLOCK3, $BLOCK3 |
| vpxor %xmm1, $BLOCK4, $BLOCK4 |
| vpxor %xmm1, $BLOCK5, $BLOCK5 |
| vpxor %xmm1, $BLOCK6, $BLOCK6 |
| |
| ${\$enc_roundx6->(1, "%xmm1")} |
| ${\$enc_roundx6->(2, "%xmm2")} |
| ${\$enc_roundx6->(3, "%xmm1")} |
| ${\$enc_roundx6->(4, "%xmm2")} |
| ${\$enc_roundx6->(5, "%xmm1")} |
| ${\$enc_roundx6->(6, "%xmm2")} |
| ${\$enc_roundx6->(7, "%xmm1")} |
| ${\$enc_roundx6->(8, "%xmm2")} |
| ${\$enc_roundx6->(9, "%xmm1")} |
| ${\$enc_roundx6->(10, "%xmm2")} |
| ${\$enc_roundx6->(11, "%xmm1")} |
| ${\$enc_roundx6->(12, "%xmm2")} |
| ${\$enc_roundx6->(13, "%xmm1")} |
| ${\$enc_roundlastx6->(14, "%xmm2")} |
| |
| vmovdqa $BLOCK1, 0*16(%rsi) |
| vmovdqa $BLOCK2, 1*16(%rsi) |
| vmovdqa $BLOCK3, 2*16(%rsi) |
| vmovdqa $BLOCK4, 3*16(%rsi) |
| vmovdqa $BLOCK5, 4*16(%rsi) |
| vmovdqa $BLOCK6, 5*16(%rsi) |
| ret |
| .cfi_endproc |
| .size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf |
| ___ |
| } |
| aes256gcmsiv_kdf(); |
| |
| print $code; |
| |
| close STDOUT or die "error closing STDOUT"; |