blob: 1a3d064e03c3b1e4772d2b6e82200f61547c63cf [file] [log] [blame]
#!/usr/bin/env perl
# Copyright (c) 2017, Shay Gueron.
# Copyright (c) 2017, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
use warnings FATAL => 'all';
# Command line: <script> <flavour> <output>   or   <script> <output>
# |flavour| selects the assembler dialect understood by x86_64-xlate.pl.
$flavour = shift;
$output = shift;
# A first argument that contains a dot is an output file name, not a flavour.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# $flavour may legitimately be undefined here; guard the match because
# warnings are fatal in this script.
$win64=0;
$win64=1 if ((defined($flavour) && $flavour =~ /[nm]asm|mingw64/) ||
             $output =~ /\.asm$/);
# Directory part of the script path (including the trailing separator), or the
# empty string when the script was started without any directory component.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
# Look for the translator next to this script first, then in ../../perlasm.
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# Everything printed to STDOUT from now on is piped through the translator.
# Fail loudly if the pipe cannot be started instead of generating nothing.
open(OUT, "| \"$^X\" \"$xlate\" " . (defined($flavour) ? $flavour : "") . " \"$output\"")
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Constant pool shared by the routines emitted below (16-byte aligned).
#   one .. eight - 128-bit values whose low 64-bit lane holds n; they are added
#                  with vpaddd to step counter blocks.
#   OR_MASK      - has only the top bit of the last 32-bit lane set; vpor'ed
#                  into the tag when counter blocks are built from it.
#   poly         - memory operand of the vpclmulqdq reduction steps.
#   mask, con3   - vpshufb selectors used by the key-schedule code.
#   con1, con2   - round-constant start vectors (1 and 0x1b in every lane).
#   and_mask     - clears the lowest 32-bit lane (vpand in aes128gcmsiv_kdf).
$code.=<<___;
.data
.align 16
one:
.quad 1,0
two:
.quad 2,0
three:
.quad 3,0
four:
.quad 4,0
five:
.quad 5,0
six:
.quad 6,0
seven:
.quad 7,0
eight:
.quad 8,0
OR_MASK:
.long 0x00000000,0x00000000,0x00000000,0x80000000
poly:
.quad 0x1, 0xc200000000000000
mask:
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
con1:
.long 1,1,1,1
con2:
.long 0x1b,0x1b,0x1b,0x1b
con3:
.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
and_mask:
.long 0,0xffffffff, 0xffffffff, 0xffffffff
___
# Everything appended to $code after this point is placed in the text section.
$code.=<<___;
.text
___
sub gfmul {
  # Emits the file-local helper GFMUL (no .globl, register-only interface):
  #   %xmm0 <- %xmm0 * %xmm1   (carry-less multiply followed by two folding
  #                             steps against the constant at poly)
  # %xmm1 is left untouched; %xmm2..%xmm5 are scratch.
  # Conceptually: __m128i GFMUL(__m128i A, __m128i B);
  my ($acc, $factor, $lo, $mid, $aux, $hi) = map { "%xmm$_" } 0 .. 5;

  my $prologue = <<___;
.type GFMUL,\@abi-omnipotent
.align 16
GFMUL:
.cfi_startproc
___

  # Four 64x64 partial products; the two cross terms are summed, split in
  # half and merged into the low ($lo) and high ($hi) halves of the product.
  my $multiply = <<___;
vpclmulqdq \$0x00, $factor, $acc, $lo
vpclmulqdq \$0x11, $factor, $acc, $hi
vpclmulqdq \$0x10, $factor, $acc, $mid
vpclmulqdq \$0x01, $factor, $acc, $aux
vpxor $aux, $mid, $mid
vpslldq \$8, $mid, $aux
vpsrldq \$8, $mid, $mid
vpxor $aux, $lo, $lo
vpxor $mid, $hi, $hi
___

  # One folding step of the low half; it is emitted twice.
  my $fold = <<___;
vpclmulqdq \$0x10, poly(%rip), $lo, $mid
vpshufd \$78, $lo, $aux
vpxor $aux, $mid, $lo
___

  my $epilogue = <<___;
vpxor $hi, $lo, $acc
ret
.cfi_endproc
.size GFMUL, .-GFMUL
___

  $code .= $prologue . $multiply . ($fold x 2) . $epilogue;
}
gfmul();
sub aesgcmsiv_htable_init {
  # Emits aesgcmsiv_htable_init, which stores H^1 .. H^8 (16 bytes each, in
  # ascending order) to |out_htable|.
  #   void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H);
  # Both pointers are accessed with vmovdqa.
  my ($table, $key_ptr) = ("%rdi", "%rsi");
  # GFMUL computes %xmm0 <- %xmm0 * %xmm1, so %xmm1 keeps H and %xmm0 walks
  # through the successive powers.
  my ($power, $base) = ("%xmm0", "%xmm1");

  my @lines = (
    ".globl aesgcmsiv_htable_init",
    ".type aesgcmsiv_htable_init,\@function,2",
    ".align 16",
    "aesgcmsiv_htable_init:",
    ".cfi_startproc",
    "vmovdqa ($key_ptr), $power",
    "vmovdqa $power, $base",
    "vmovdqa $power, ($table) # H",
  );

  # Each further table slot is the previous power multiplied by H once more.
  for my $exponent (2 .. 8) {
    my $disp = 16 * ($exponent - 1);
    push @lines,
      "call GFMUL",
      "vmovdqa $power, $disp($table) # H^$exponent";
  }

  push @lines,
    "ret",
    ".cfi_endproc",
    ".size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init";

  $code .= join("\n", @lines) . "\n";
}
aesgcmsiv_htable_init();
sub aesgcmsiv_htable6_init {
  # Emits aesgcmsiv_htable6_init, the six-entry variant of the power table:
  # H^1 .. H^6 are stored (16 bytes each, ascending) to |out_htable|.
  #   void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H);
  # Both pointers are accessed with vmovdqa.
  my ($table, $key_ptr) = ("%rdi", "%rsi");
  # GFMUL computes %xmm0 <- %xmm0 * %xmm1; %xmm1 holds H throughout.
  my ($power, $base) = ("%xmm0", "%xmm1");

  my @lines = (
    ".globl aesgcmsiv_htable6_init",
    ".type aesgcmsiv_htable6_init,\@function,2",
    ".align 16",
    "aesgcmsiv_htable6_init:",
    ".cfi_startproc",
    "vmovdqa ($key_ptr), $power",
    "vmovdqa $power, $base",
    "vmovdqa $power, ($table) # H",
  );

  # Multiply by H once per additional slot.
  for my $exponent (2 .. 6) {
    my $disp = 16 * ($exponent - 1);
    push @lines,
      "call GFMUL",
      "vmovdqa $power, $disp($table) # H^$exponent";
  }

  push @lines,
    "ret",
    ".cfi_endproc",
    ".size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init";

  $code .= join("\n", @lines) . "\n";
}
aesgcmsiv_htable6_init();
sub aesgcmsiv_htable_polyval {
# Emits aesgcmsiv_htable_polyval, which folds LEN bytes of MSG into the
# 16-byte accumulator stored at T, using the table of powers of H at Htbl.
# The emitted routine returns immediately when LEN is zero. Otherwise it first
# consumes LEN % 128 bytes (one 16-byte block at a time), then 128 bytes per
# main-loop iteration, and finally writes the reduced result back to T.
# NOTE(review): the code only ever steps in 16-byte units, so LEN is presumably
# a multiple of 16 - confirm against the callers.
#
# void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T);
# parameter 1: %rdi Htable - pointer to Htable
# parameter 2: %rsi INp - pointer to input
# parameter 3: %rdx LEN - length of BUFFER in bytes
# parameter 4: %rcx T - pointer to POLYVAL output
# $DATA holds the message block currently being multiplied.
my $DATA = "%xmm0";
# $hlp0 is the byte offset into Htbl used while hashing the prefix blocks.
my $hlp0 = "%r11";
my $Htbl = "%rdi";
my $inp = "%rsi";
my $len = "%rdx";
# Unreduced product accumulators: $TMP0 collects the 0x00 products, $TMP1 the
# 0x11 products and $TMP2 both cross products; $TMP3 is the per-product
# scratch register and $TMP4 carries the pending reduction term.
my $TMP0 = "%xmm3";
my $TMP1 = "%xmm4";
my $TMP2 = "%xmm5";
my $TMP3 = "%xmm6";
my $TMP4 = "%xmm7";
my $Tp = "%rcx";
# $T / $Xhi hold the two halves of the not-yet-reduced running value between
# iterations.
my $T = "%xmm1";
my $Xhi = "%xmm9";
# Returns the assembly that multiplies $DATA by table entry $i (at byte offset
# 16*$i from $Htbl) and XORs the four partial products into $TMP0/$TMP1/$TMP2.
my $SCHOOLBOOK_AAD = sub {
my ($i)=@_;
return <<___;
vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP1, $TMP1
vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
___
};
# In the main loop below the two-stage reduction of the previous iteration's
# value in $T is interleaved with the multiplications of the current eight
# blocks; the order of the emitted instructions therefore matters.
$code.=<<___;
.globl aesgcmsiv_htable_polyval
.type aesgcmsiv_htable_polyval,\@function,4
.align 16
aesgcmsiv_htable_polyval:
.cfi_startproc
test $len, $len
jnz .Lhtable_polyval_start
ret
.Lhtable_polyval_start:
vzeroall
# We hash 8 blocks each iteration. If the total number of blocks is not a
# multiple of 8, we first hash the leading n%8 blocks.
movq $len, $hlp0
andq \$127, $hlp0
jz .Lhtable_polyval_no_prefix
vpxor $Xhi, $Xhi, $Xhi
vmovdqa ($Tp), $T
sub $hlp0, $len
sub \$16, $hlp0
# hash first prefix block
vmovdqu ($inp), $DATA
vpxor $T, $DATA, $DATA
vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2
vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0
vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1
vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
lea 16($inp), $inp
test $hlp0, $hlp0
jnz .Lhtable_polyval_prefix_loop
jmp .Lhtable_polyval_prefix_complete
# hash remaining prefix bocks (up to 7 total prefix blocks)
.align 64
.Lhtable_polyval_prefix_loop:
sub \$16, $hlp0
vmovdqu ($inp), $DATA # next data block
vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP1, $TMP1
vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
test $hlp0, $hlp0
lea 16($inp), $inp
jnz .Lhtable_polyval_prefix_loop
.Lhtable_polyval_prefix_complete:
vpsrldq \$8, $TMP2, $TMP3
vpslldq \$8, $TMP2, $TMP2
vpxor $TMP3, $TMP1, $Xhi
vpxor $TMP2, $TMP0, $T
jmp .Lhtable_polyval_main_loop
.Lhtable_polyval_no_prefix:
# At this point we know the number of blocks is a multiple of 8. However,
# the reduction in the main loop includes a multiplication by x^(-128). In
# order to counter this, the existing tag needs to be multipled by x^128.
# In practice, this just means that it is loaded into $Xhi, not $T.
vpxor $T, $T, $T
vmovdqa ($Tp), $Xhi
.align 64
.Lhtable_polyval_main_loop:
sub \$0x80, $len
jb .Lhtable_polyval_out
vmovdqu 16*7($inp), $DATA # Ii
vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2
vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0
vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1
vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
#########################################################
vmovdqu 16*6($inp), $DATA
${\$SCHOOLBOOK_AAD->(1)}
#########################################################
vmovdqu 16*5($inp), $DATA
vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 1a
vpalignr \$8, $T, $T, $T
${\$SCHOOLBOOK_AAD->(2)}
vpxor $TMP4, $T, $T # reduction stage 1b
#########################################################
vmovdqu 16*4($inp), $DATA
${\$SCHOOLBOOK_AAD->(3)}
#########################################################
vmovdqu 16*3($inp), $DATA
vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 2a
vpalignr \$8, $T, $T, $T
${\$SCHOOLBOOK_AAD->(4)}
vpxor $TMP4, $T, $T # reduction stage 2b
#########################################################
vmovdqu 16*2($inp), $DATA
${\$SCHOOLBOOK_AAD->(5)}
vpxor $Xhi, $T, $T # reduction finalize
#########################################################
vmovdqu 16*1($inp), $DATA
${\$SCHOOLBOOK_AAD->(6)}
#########################################################
vmovdqu 16*0($inp), $DATA
vpxor $T, $DATA, $DATA
${\$SCHOOLBOOK_AAD->(7)}
#########################################################
vpsrldq \$8, $TMP2, $TMP3
vpslldq \$8, $TMP2, $TMP2
vpxor $TMP3, $TMP1, $Xhi
vpxor $TMP2, $TMP0, $T
lea 16*8($inp), $inp
jmp .Lhtable_polyval_main_loop
#########################################################
.Lhtable_polyval_out:
vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
vpalignr \$8, $T, $T, $T
vpxor $TMP3, $T, $T
vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
vpalignr \$8, $T, $T, $T
vpxor $TMP3, $T, $T
vpxor $Xhi, $T, $T
vmovdqu $T, ($Tp)
vzeroupper
ret
.cfi_endproc
.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
___
}
aesgcmsiv_htable_polyval();
sub aesgcmsiv_polyval_horner {
  # Emits aesgcmsiv_polyval_horner, a block-at-a-time evaluation that, for each
  # 16-byte input block, XORs the block into the accumulator and multiplies the
  # accumulator by H via GFMUL.
  #
  #   void aesgcmsiv_polyval_horner(unsigned char T[16],      // in/out
  #                                 const unsigned char* H,   // H
  #                                 unsigned char* BUF,       // input
  #                                 unsigned int blocks);     // block count
  #
  # parameter 1: %rdi  T   - accumulator, read and written with vmovdqa
  # parameter 2: %rsi  Hp  - pointer to H, read with vmovdqa
  # parameter 3: %rdx  INp - pointer to the input blocks
  # parameter 4: %rcx  L   - number of blocks; the full 64-bit register is
  #                          tested and shifted, and the routine returns at
  #                          once when it is zero
  my ($tag_ptr, $key_ptr, $in_ptr, $nbytes) = ("%rdi", "%rsi", "%rdx", "%rcx");
  my $offset = "%r10";
  # GFMUL multiplies %xmm0 by %xmm1 in place, which fixes these two choices.
  my ($acc, $key) = ("%xmm0", "%xmm1");

  my $entry = <<___;
.globl aesgcmsiv_polyval_horner
.type aesgcmsiv_polyval_horner,\@function,4
.align 16
aesgcmsiv_polyval_horner:
.cfi_startproc
test $nbytes, $nbytes
jnz .Lpolyval_horner_start
ret
.Lpolyval_horner_start:
# We will start with L GFMULS for POLYVAL(BIG_BUFFER)
# RES = GFMUL(RES, H)
xorq $offset, $offset
shlq \$4, $nbytes # L contains number of bytes to process
vmovdqa ($key_ptr), $key
vmovdqa ($tag_ptr), $acc
___

  my $loop = <<___;
.Lpolyval_horner_loop:
vpxor ($in_ptr,$offset), $acc, $acc # RES = RES + Xi
call GFMUL # RES = RES * H
add \$16, $offset
cmp $offset, $nbytes
jne .Lpolyval_horner_loop
___

  my $exit = <<___;
# calculation of T is complete. RES=T
vmovdqa $acc, ($tag_ptr)
ret
.cfi_endproc
.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
___

  $code .= $entry . $loop . $exit;
}
aesgcmsiv_polyval_horner();
# Emits aes128gcmsiv_aes_ks, which expands a 16-byte key into eleven 16-byte
# round keys (176 bytes) at |out_expanded_key|.
# void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
# parameter 1: %rdi - key, read with an unaligned load (vmovdqu)
# parameter 2: %rsi - output, written with vmovdqa
#
# The loop produces round keys 1..8, shifting the round constant in %xmm0 left
# by one bit each time (it starts from con1). Round keys 9 and 10 are unrolled
# after the loop and start from con2 (0x1b).
# The `jne` at the bottom of the loop consumes the flags set by `subq` near the
# top; none of the vector instructions in between modify the flags.
# Registers written by the emitted code: %rax, %rsi, %xmm0-%xmm3, %xmm15.
$code.=<<___;
.globl aes128gcmsiv_aes_ks
.type aes128gcmsiv_aes_ks,\@function,2
.align 16
aes128gcmsiv_aes_ks:
.cfi_startproc
vmovdqu (%rdi), %xmm1 # xmm1 = user key
vmovdqa %xmm1, (%rsi) # rsi points to output
vmovdqa con1(%rip), %xmm0
vmovdqa mask(%rip), %xmm15
movq \$8, %rax
.Lks128_loop:
addq \$16, %rsi # rsi points for next key
subq \$1, %rax
vpshufb %xmm15, %xmm1, %xmm2 # xmm2 = shuffled user key
vaesenclast %xmm0, %xmm2, %xmm2
vpslld \$1, %xmm0, %xmm0
vpslldq \$4, %xmm1, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, (%rsi)
jne .Lks128_loop
vmovdqa con2(%rip), %xmm0
vpshufb %xmm15, %xmm1, %xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpslld \$1, %xmm0, %xmm0
vpslldq \$4, %xmm1, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, 16(%rsi)
vpshufb %xmm15, %xmm1, %xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpslldq \$4, %xmm1, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, 32(%rsi)
ret
.cfi_endproc
.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
___
# Emits aes256gcmsiv_aes_ks, which expands a 32-byte key into fifteen 16-byte
# round keys (240 bytes) at |out_expanded_key|.
# void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
# parameter 1: %rdi - key, read with unaligned loads (vmovdqu)
# parameter 2: %rsi - output, written with vmovdqa
#
# The first two round keys are the key itself. Each of the six loop iterations
# derives two more (the second one uses the all-zero %xmm14 as round constant),
# and the final round key is produced after the loop.
# The `jne` at the bottom of the loop consumes the flags set by `subq` near the
# top; none of the vector instructions in between modify the flags.
# Registers written by the emitted code: %rax, %rsi, %xmm0-%xmm4, %xmm14,
# %xmm15.
# The closing .size directive matches every other function in this file.
$code.=<<___;
.globl aes256gcmsiv_aes_ks
.type aes256gcmsiv_aes_ks,\@function,2
.align 16
aes256gcmsiv_aes_ks:
.cfi_startproc
vmovdqu (%rdi), %xmm1
vmovdqu 16(%rdi), %xmm3
vmovdqa %xmm1, (%rsi)
vmovdqa %xmm3, 16(%rsi)
vmovdqa con1(%rip), %xmm0
vmovdqa mask(%rip), %xmm15
vpxor %xmm14, %xmm14, %xmm14
mov \$6, %rax
.Lks256_loop:
add \$32, %rsi
subq \$1, %rax
vpshufb %xmm15, %xmm3, %xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpslld \$1, %xmm0, %xmm0
vpsllq \$32, %xmm1, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpshufb con3(%rip), %xmm1, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, (%rsi)
vpshufd \$0xff, %xmm1, %xmm2
vaesenclast %xmm14, %xmm2, %xmm2
vpsllq \$32, %xmm3, %xmm4
vpxor %xmm4, %xmm3, %xmm3
vpshufb con3(%rip), %xmm3, %xmm4
vpxor %xmm4, %xmm3, %xmm3
vpxor %xmm2, %xmm3, %xmm3
vmovdqa %xmm3, 16(%rsi)
jne .Lks256_loop
vpshufb %xmm15, %xmm3, %xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpsllq \$32, %xmm1, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpshufb con3(%rip), %xmm1, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, 32(%rsi)
ret
.cfi_endproc
.size aes256gcmsiv_aes_ks,.-aes256gcmsiv_aes_ks
___
sub aes128gcmsiv_aes_ks_enc_x1 {
# Emits aes128gcmsiv_aes_ks_enc_x1, which expands a 16-byte key into its round
# keys and, interleaved with that, encrypts a single 16-byte block: every
# freshly derived round key is applied to the block and stored to the key
# schedule in the same step.
# All four pointers are accessed with vmovdqa.
# $KS1_REGA holds the current round key, $KS1_REGB the transformed word that is
# folded into the next one.
my $KS1_REGA = "%xmm1";
my $KS1_REGB = "%xmm2";
# $BLOCK1 is the block being encrypted; $AUXREG is scratch for $KS_BLOCK.
my $BLOCK1 = "%xmm4";
my $AUXREG = "%xmm3";
# Returns the assembly that turns the round key in $reg into the next round
# key, given the already transformed word in $reg2; $auxReg is clobbered.
my $KS_BLOCK = sub {
my ($reg, $reg2, $auxReg) = @_;
return <<___;
vpsllq \$32, $reg, $auxReg #!!saving mov instruction to xmm3
vpxor $auxReg, $reg, $reg
vpshufb con3(%rip), $reg, $auxReg
vpxor $auxReg, $reg, $reg
vpxor $reg2, $reg, $reg
___
};
# Returns the assembly for key-schedule round $i: derive the next round key,
# shift the round constant in %xmm0 left by one bit, apply the new key to
# $BLOCK1 with vaesenc and store it at offset 16*$i from the register $j.
my $round = sub {
my ($i, $j) = @_;
return <<___;
vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpslld \$1, %xmm0, %xmm0
${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
vaesenc %xmm1, $BLOCK1, $BLOCK1
vmovdqa %xmm1, ${\eval(16*$i)}($j)
___
};
# Same as $round, but for the final round: the round constant is not shifted
# any further and the block is finished with vaesenclast.
my $roundlast = sub {
my ($i, $j) = @_;
return <<___;
vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
vaesenclast %xmm0, %xmm2, %xmm2
${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
vaesenclast %xmm1, $BLOCK1, $BLOCK1
vmovdqa %xmm1, ${\eval(16*$i)}($j)
___
};
# parameter 1: %rdi Pointer to PT
# parameter 2: %rsi Pointer to CT
# parameter 3: %rdx Pointer to keys
# parameter 4: %rcx Pointer to initial key
$code.=<<___;
.globl aes128gcmsiv_aes_ks_enc_x1
.type aes128gcmsiv_aes_ks_enc_x1,\@function,4
.align 16
aes128gcmsiv_aes_ks_enc_x1:
.cfi_startproc
vmovdqa (%rcx), %xmm1 # xmm1 = first 16 bytes of random key
vmovdqa 0*16(%rdi), $BLOCK1
vmovdqa %xmm1, (%rdx) # KEY[0] = first 16 bytes of random key
vpxor %xmm1, $BLOCK1, $BLOCK1
vmovdqa con1(%rip), %xmm0 # xmm0 = 1,1,1,1
vmovdqa mask(%rip), %xmm15 # xmm15 = mask
${\$round->(1, "%rdx")}
${\$round->(2, "%rdx")}
${\$round->(3, "%rdx")}
${\$round->(4, "%rdx")}
${\$round->(5, "%rdx")}
${\$round->(6, "%rdx")}
${\$round->(7, "%rdx")}
${\$round->(8, "%rdx")}
vmovdqa con2(%rip), %xmm0
${\$round->(9, "%rdx")}
${\$roundlast->(10, "%rdx")}
vmovdqa $BLOCK1, 0*16(%rsi)
ret
.cfi_endproc
.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
___
}
aes128gcmsiv_aes_ks_enc_x1();
sub aes128gcmsiv_kdf {
# Emits aes128gcmsiv_kdf, which builds four consecutive counter blocks from the
# nonce and encrypts them with the expanded key (one initial XOR plus ten
# rounds), writing the four results (64 bytes) to |out_key_material|.
# The counter blocks are formed by moving the first three 32-bit words of the
# nonce up by one lane (vpshufd \$0x90), clearing the lowest lane (and_mask)
# and adding `one` zero to three times.
# All three pointers are accessed with vmovdqa.
my $BLOCK1 = "%xmm9";
my $BLOCK2 = "%xmm10";
my $BLOCK3 = "%xmm11";
my $BLOCK4 = "%xmm12";
# $BLOCK5, $BLOCK6, $KSp and $STATE_1 are not referenced by the code below.
my $BLOCK5 = "%xmm13";
my $BLOCK6 = "%xmm14";
# $ONE shares %xmm13 with $BLOCK5.
my $ONE = "%xmm13";
my $KSp = "%rdx";
my $STATE_1 = "%xmm1";
# Returns the assembly that loads round key $i from the schedule at %rdx into
# the register $j and applies one vaesenc round to all four blocks.
my $enc_roundx4 = sub {
my ($i, $j) = @_;
return <<___;
vmovdqa ${\eval($i*16)}(%rdx), $j
vaesenc $j, $BLOCK1, $BLOCK1
vaesenc $j, $BLOCK2, $BLOCK2
vaesenc $j, $BLOCK3, $BLOCK3
vaesenc $j, $BLOCK4, $BLOCK4
___
};
# Same as $enc_roundx4, but emits the final round (vaesenclast).
my $enc_roundlastx4 = sub {
my ($i, $j) = @_;
return <<___;
vmovdqa ${\eval($i*16)}(%rdx), $j
vaesenclast $j, $BLOCK1, $BLOCK1
vaesenclast $j, $BLOCK2, $BLOCK2
vaesenclast $j, $BLOCK3, $BLOCK3
vaesenclast $j, $BLOCK4, $BLOCK4
___
};
# void aes128gcmsiv_kdf(const uint8_t nonce[16],
# uint8_t *out_key_material,
# const uint8_t *key_schedule);
# The round keys are loaded alternately into %xmm1 and %xmm2.
$code.=<<___;
.globl aes128gcmsiv_kdf
.type aes128gcmsiv_kdf,\@function,3
.align 16
aes128gcmsiv_kdf:
.cfi_startproc
# parameter 1: %rdi Pointer to NONCE
# parameter 2: %rsi Pointer to CT
# parameter 4: %rdx Pointer to keys
vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
vmovdqa 0*16(%rdi), $BLOCK1
vmovdqa and_mask(%rip), $BLOCK4
vmovdqa one(%rip), $ONE
vpshufd \$0x90, $BLOCK1, $BLOCK1
vpand $BLOCK4, $BLOCK1, $BLOCK1
vpaddd $ONE, $BLOCK1, $BLOCK2
vpaddd $ONE, $BLOCK2, $BLOCK3
vpaddd $ONE, $BLOCK3, $BLOCK4
vpxor %xmm1, $BLOCK1, $BLOCK1
vpxor %xmm1, $BLOCK2, $BLOCK2
vpxor %xmm1, $BLOCK3, $BLOCK3
vpxor %xmm1, $BLOCK4, $BLOCK4
${\$enc_roundx4->(1, "%xmm1")}
${\$enc_roundx4->(2, "%xmm2")}
${\$enc_roundx4->(3, "%xmm1")}
${\$enc_roundx4->(4, "%xmm2")}
${\$enc_roundx4->(5, "%xmm1")}
${\$enc_roundx4->(6, "%xmm2")}
${\$enc_roundx4->(7, "%xmm1")}
${\$enc_roundx4->(8, "%xmm2")}
${\$enc_roundx4->(9, "%xmm1")}
${\$enc_roundlastx4->(10, "%xmm2")}
vmovdqa $BLOCK1, 0*16(%rsi)
vmovdqa $BLOCK2, 1*16(%rsi)
vmovdqa $BLOCK3, 2*16(%rsi)
vmovdqa $BLOCK4, 3*16(%rsi)
ret
.cfi_endproc
.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
___
}
aes128gcmsiv_kdf();
sub aes128gcmsiv_enc_msg_x4 {
# Emits aes128gcmsiv_enc_msg_x4, a counter-mode encryption routine that
# processes four blocks per main-loop iteration and any remaining whole blocks
# (block count modulo 4) one at a time. The initial counter block is the tag
# with the bit from OR_MASK set; counters are stepped with 32-bit lane adds.
# The byte length is converted to a block count with a right shift by 4, so
# the emitted code does not process a trailing partial block.
# %r12 and %r13 are pushed on entry and popped on exit but are not otherwise
# used by the emitted code.
# $CTR1..$CTR4 are the counter blocks for the next group of four.
my $CTR1 = "%xmm0";
my $CTR2 = "%xmm1";
my $CTR3 = "%xmm2";
my $CTR4 = "%xmm3";
# $ADDER holds the constant `four`, the per-iteration counter increment.
my $ADDER = "%xmm4";
# $STATE1..$STATE4 are the blocks being encrypted in the current iteration.
my $STATE1 = "%xmm5";
my $STATE2 = "%xmm6";
my $STATE3 = "%xmm7";
my $STATE4 = "%xmm8";
# $TMP receives each round key; $TMP2 and $TMP3 are not referenced below.
my $TMP = "%xmm12";
my $TMP2 = "%xmm13";
my $TMP3 = "%xmm14";
my $IV = "%xmm15";
my $PT = "%rdi";
my $CT = "%rsi";
my $TAG = "%rdx";
my $KS = "%rcx";
my $LEN = "%r8";
# Returns the assembly that loads round key $i from $KS and applies one
# vaesenc round to all four state registers.
my $aes_round = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $TMP
vaesenc $TMP, $STATE1, $STATE1
vaesenc $TMP, $STATE2, $STATE2
vaesenc $TMP, $STATE3, $STATE3
vaesenc $TMP, $STATE4, $STATE4
___
};
# Same as $aes_round, but emits the final round (vaesenclast).
my $aes_lastround = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $TMP
vaesenclast $TMP, $STATE1, $STATE1
vaesenclast $TMP, $STATE2, $STATE2
vaesenclast $TMP, $STATE3, $STATE3
vaesenclast $TMP, $STATE4, $STATE4
___
};
# void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
# unsigned char* TAG, unsigned char* KS,
# size_t byte_len);
# parameter 1: %rdi #PT
# parameter 2: %rsi #CT
# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
# parameter 4: %rcx #KS
# parameter 5: %r8 #LEN MSG_length in bytes
#
# In the main loop the pointers are biased by -64 before the loop and advanced
# at the top of each iteration, and the counter increments are interleaved
# with the first four AES rounds.
$code.=<<___;
.globl aes128gcmsiv_enc_msg_x4
.type aes128gcmsiv_enc_msg_x4,\@function,5
.align 16
aes128gcmsiv_enc_msg_x4:
.cfi_startproc
test $LEN, $LEN
jnz .L128_enc_msg_x4_start
ret
.L128_enc_msg_x4_start:
pushq %r12
.cfi_push %r12
pushq %r13
.cfi_push %r13
shrq \$4, $LEN # LEN = num of blocks
movq $LEN, %r10
shlq \$62, %r10
shrq \$62, %r10
# make IV from TAG
vmovdqa ($TAG), $IV
vpor OR_MASK(%rip), $IV, $IV #IV = [1]TAG[126...32][00..00]
vmovdqu four(%rip), $ADDER # Register to increment counters
vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
shrq \$2, $LEN
je .L128_enc_msg_x4_check_remainder
subq \$64, $CT
subq \$64, $PT
.L128_enc_msg_x4_loop1:
addq \$64, $CT
addq \$64, $PT
vmovdqa $CTR1, $STATE1
vmovdqa $CTR2, $STATE2
vmovdqa $CTR3, $STATE3
vmovdqa $CTR4, $STATE4
vpxor ($KS), $STATE1, $STATE1
vpxor ($KS), $STATE2, $STATE2
vpxor ($KS), $STATE3, $STATE3
vpxor ($KS), $STATE4, $STATE4
${\$aes_round->(1)}
vpaddd $ADDER, $CTR1, $CTR1
${\$aes_round->(2)}
vpaddd $ADDER, $CTR2, $CTR2
${\$aes_round->(3)}
vpaddd $ADDER, $CTR3, $CTR3
${\$aes_round->(4)}
vpaddd $ADDER, $CTR4, $CTR4
${\$aes_round->(5)}
${\$aes_round->(6)}
${\$aes_round->(7)}
${\$aes_round->(8)}
${\$aes_round->(9)}
${\$aes_lastround->(10)}
# XOR with Plaintext
vpxor 0*16($PT), $STATE1, $STATE1
vpxor 1*16($PT), $STATE2, $STATE2
vpxor 2*16($PT), $STATE3, $STATE3
vpxor 3*16($PT), $STATE4, $STATE4
subq \$1, $LEN
vmovdqu $STATE1, 0*16($CT)
vmovdqu $STATE2, 1*16($CT)
vmovdqu $STATE3, 2*16($CT)
vmovdqu $STATE4, 3*16($CT)
jne .L128_enc_msg_x4_loop1
addq \$64,$CT
addq \$64,$PT
.L128_enc_msg_x4_check_remainder:
cmpq \$0, %r10
je .L128_enc_msg_x4_out
.L128_enc_msg_x4_loop2:
# enc each block separately
# CTR1 is the highest counter (even if no LOOP done)
vmovdqa $CTR1, $STATE1
vpaddd one(%rip), $CTR1, $CTR1 # inc counter
vpxor ($KS), $STATE1, $STATE1
vaesenc 16($KS), $STATE1, $STATE1
vaesenc 32($KS), $STATE1, $STATE1
vaesenc 48($KS), $STATE1, $STATE1
vaesenc 64($KS), $STATE1, $STATE1
vaesenc 80($KS), $STATE1, $STATE1
vaesenc 96($KS), $STATE1, $STATE1
vaesenc 112($KS), $STATE1, $STATE1
vaesenc 128($KS), $STATE1, $STATE1
vaesenc 144($KS), $STATE1, $STATE1
vaesenclast 160($KS), $STATE1, $STATE1
# XOR with plaintext
vpxor ($PT), $STATE1, $STATE1
vmovdqu $STATE1, ($CT)
addq \$16, $PT
addq \$16, $CT
subq \$1, %r10
jne .L128_enc_msg_x4_loop2
.L128_enc_msg_x4_out:
popq %r13
.cfi_pop %r13
popq %r12
.cfi_pop %r12
ret
.cfi_endproc
.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
___
}
aes128gcmsiv_enc_msg_x4();
sub aes128gcmsiv_enc_msg_x8 {
# Emits aes128gcmsiv_enc_msg_x8, the eight-blocks-per-iteration variant of the
# counter-mode encryption routine. Whole blocks left over after the main loop
# (block count modulo 8) are encrypted one at a time. The byte length is
# converted to a block count with a right shift by 4, so the emitted code does
# not process a trailing partial block.
# Seven counters live in registers; the eighth is kept in a 64-byte aligned
# stack slot at (%rsp), with %rbp preserving the original stack pointer.
# $STATE1..$STATE8 are the blocks being encrypted in the current iteration.
my $STATE1 = "%xmm1";
my $STATE2 = "%xmm2";
my $STATE3 = "%xmm3";
my $STATE4 = "%xmm4";
my $STATE5 = "%xmm5";
my $STATE6 = "%xmm6";
my $STATE7 = "%xmm7";
my $STATE8 = "%xmm8";
# $CTR1..$CTR7 are the register-resident counters.
my $CTR1 = "%xmm0";
my $CTR2 = "%xmm9";
my $CTR3 = "%xmm10";
my $CTR4 = "%xmm11";
my $CTR5 = "%xmm12";
my $CTR6 = "%xmm13";
my $CTR7 = "%xmm14";
# $SCHED receives each round key.
my $SCHED = "%xmm15";
# $TMP1 and $TMP2 share registers with $STATE1 and $STATE2; $TMP1 is only used
# before the main loop and $TMP2 is not referenced below.
my $TMP1 = "%xmm1";
my $TMP2 = "%xmm2";
my $PT = "%rdi";
my $CT = "%rsi";
my $TAG = "%rdx";
my $KS = "%rcx";
my $LEN = "%r8";
# Returns the assembly that loads round key $i from $KS and applies one
# vaesenc round to all eight state registers.
my $aes_round8 = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $SCHED
vaesenc $SCHED, $STATE1, $STATE1
vaesenc $SCHED, $STATE2, $STATE2
vaesenc $SCHED, $STATE3, $STATE3
vaesenc $SCHED, $STATE4, $STATE4
vaesenc $SCHED, $STATE5, $STATE5
vaesenc $SCHED, $STATE6, $STATE6
vaesenc $SCHED, $STATE7, $STATE7
vaesenc $SCHED, $STATE8, $STATE8
___
};
# Same as $aes_round8, but emits the final round (vaesenclast).
my $aes_lastround8 = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $SCHED
vaesenclast $SCHED, $STATE1, $STATE1
vaesenclast $SCHED, $STATE2, $STATE2
vaesenclast $SCHED, $STATE3, $STATE3
vaesenclast $SCHED, $STATE4, $STATE4
vaesenclast $SCHED, $STATE5, $STATE5
vaesenclast $SCHED, $STATE6, $STATE6
vaesenclast $SCHED, $STATE7, $STATE7
vaesenclast $SCHED, $STATE8, $STATE8
___
};
# void aes128gcmsiv_enc_msg_x8(unsigned char* PT,
# unsigned char* CT,
# unsigned char* TAG,
# unsigned char* KS,
# size_t byte_len);
# parameter 1: %rdi #PT
# parameter 2: %rsi #CT
# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
# parameter 4: %rcx #KS
# parameter 5: %r8 #LEN MSG_length in bytes
#
# Inside the main loop $CTR7 doubles as scratch for stepping the eighth
# counter: the stack value is loaded, advanced by eight and stored back, and
# subtracting one from it then yields the next value of $CTR7 itself.
# %r12 and %r13 are pushed on entry and popped on exit but are not otherwise
# used by the emitted code.
$code.=<<___;
.globl aes128gcmsiv_enc_msg_x8
.type aes128gcmsiv_enc_msg_x8,\@function,5
.align 16
aes128gcmsiv_enc_msg_x8:
.cfi_startproc
test $LEN, $LEN
jnz .L128_enc_msg_x8_start
ret
.L128_enc_msg_x8_start:
pushq %r12
.cfi_push %r12
pushq %r13
.cfi_push %r13
pushq %rbp
.cfi_push %rbp
movq %rsp, %rbp
.cfi_def_cfa_register rbp
# Place in stack
subq \$128, %rsp
andq \$-64, %rsp
shrq \$4, $LEN # LEN = num of blocks
movq $LEN, %r10
shlq \$61, %r10
shrq \$61, %r10
# make IV from TAG
vmovdqu ($TAG), $TMP1
vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00]
# store counter8 in the stack
vpaddd seven(%rip), $TMP1, $CTR1
vmovdqu $CTR1, (%rsp) # CTR8 = TAG[127...32][00..07]
vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]
shrq \$3, $LEN
je .L128_enc_msg_x8_check_remainder
subq \$128, $CT
subq \$128, $PT
.L128_enc_msg_x8_loop1:
addq \$128, $CT
addq \$128, $PT
vmovdqa $CTR1, $STATE1
vmovdqa $CTR2, $STATE2
vmovdqa $CTR3, $STATE3
vmovdqa $CTR4, $STATE4
vmovdqa $CTR5, $STATE5
vmovdqa $CTR6, $STATE6
vmovdqa $CTR7, $STATE7
# move from stack
vmovdqu (%rsp), $STATE8
vpxor ($KS), $STATE1, $STATE1
vpxor ($KS), $STATE2, $STATE2
vpxor ($KS), $STATE3, $STATE3
vpxor ($KS), $STATE4, $STATE4
vpxor ($KS), $STATE5, $STATE5
vpxor ($KS), $STATE6, $STATE6
vpxor ($KS), $STATE7, $STATE7
vpxor ($KS), $STATE8, $STATE8
${\$aes_round8->(1)}
vmovdqu (%rsp), $CTR7 # deal with CTR8
vpaddd eight(%rip), $CTR7, $CTR7
vmovdqu $CTR7, (%rsp)
${\$aes_round8->(2)}
vpsubd one(%rip), $CTR7, $CTR7
${\$aes_round8->(3)}
vpaddd eight(%rip), $CTR1, $CTR1
${\$aes_round8->(4)}
vpaddd eight(%rip), $CTR2, $CTR2
${\$aes_round8->(5)}
vpaddd eight(%rip), $CTR3, $CTR3
${\$aes_round8->(6)}
vpaddd eight(%rip), $CTR4, $CTR4
${\$aes_round8->(7)}
vpaddd eight(%rip), $CTR5, $CTR5
${\$aes_round8->(8)}
vpaddd eight(%rip), $CTR6, $CTR6
${\$aes_round8->(9)}
${\$aes_lastround8->(10)}
# XOR with Plaintext
vpxor 0*16($PT), $STATE1, $STATE1
vpxor 1*16($PT), $STATE2, $STATE2
vpxor 2*16($PT), $STATE3, $STATE3
vpxor 3*16($PT), $STATE4, $STATE4
vpxor 4*16($PT), $STATE5, $STATE5
vpxor 5*16($PT), $STATE6, $STATE6
vpxor 6*16($PT), $STATE7, $STATE7
vpxor 7*16($PT), $STATE8, $STATE8
dec $LEN
vmovdqu $STATE1, 0*16($CT)
vmovdqu $STATE2, 1*16($CT)
vmovdqu $STATE3, 2*16($CT)
vmovdqu $STATE4, 3*16($CT)
vmovdqu $STATE5, 4*16($CT)
vmovdqu $STATE6, 5*16($CT)
vmovdqu $STATE7, 6*16($CT)
vmovdqu $STATE8, 7*16($CT)
jne .L128_enc_msg_x8_loop1
addq \$128, $CT
addq \$128, $PT
.L128_enc_msg_x8_check_remainder:
cmpq \$0, %r10
je .L128_enc_msg_x8_out
.L128_enc_msg_x8_loop2:
# enc each block separately
# CTR1 is the highest counter (even if no LOOP done)
vmovdqa $CTR1, $STATE1
vpaddd one(%rip), $CTR1, $CTR1 # inc counter
vpxor ($KS), $STATE1, $STATE1
vaesenc 16($KS), $STATE1, $STATE1
vaesenc 32($KS), $STATE1, $STATE1
vaesenc 48($KS), $STATE1, $STATE1
vaesenc 64($KS), $STATE1, $STATE1
vaesenc 80($KS), $STATE1, $STATE1
vaesenc 96($KS), $STATE1, $STATE1
vaesenc 112($KS), $STATE1, $STATE1
vaesenc 128($KS), $STATE1, $STATE1
vaesenc 144($KS), $STATE1, $STATE1
vaesenclast 160($KS), $STATE1, $STATE1
# XOR with Plaintext
vpxor ($PT), $STATE1, $STATE1
vmovdqu $STATE1, ($CT)
addq \$16, $PT
addq \$16, $CT
decq %r10
jne .L128_enc_msg_x8_loop2
.L128_enc_msg_x8_out:
movq %rbp, %rsp
.cfi_def_cfa_register %rsp
popq %rbp
.cfi_pop %rbp
popq %r13
.cfi_pop %r13
popq %r12
.cfi_pop %r12
ret
.cfi_endproc
.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
___
}
aes128gcmsiv_enc_msg_x8();
sub aesgcmsiv_dec {
my ($aes256) = @_;
my $T = "%xmm0";
my $TMP0 = "%xmm1";
my $TMP1 = "%xmm2";
my $TMP2 = "%xmm3";
my $TMP3 = "%xmm4";
my $TMP4 = "%xmm5";
my $TMP5 = "%xmm6";
my $CTR1 = "%xmm7";
my $CTR2 = "%xmm8";
my $CTR3 = "%xmm9";
my $CTR4 = "%xmm10";
my $CTR5 = "%xmm11";
my $CTR6 = "%xmm12";
my $CTR = "%xmm15";
my $CT = "%rdi";
my $PT = "%rsi";
my $POL = "%rdx";
my $Htbl = "%rcx";
my $KS = "%r8";
my $LEN = "%r9";
my $secureBuffer = "%rax";
my $HTABLE_ROUNDS = "%xmm13";
my $labelPrefix = "128";
if ($aes256) {
$labelPrefix = "256";
}
my $aes_round_dec = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $TMP3
vaesenc $TMP3, $CTR1, $CTR1
vaesenc $TMP3, $CTR2, $CTR2
vaesenc $TMP3, $CTR3, $CTR3
vaesenc $TMP3, $CTR4, $CTR4
vaesenc $TMP3, $CTR5, $CTR5
vaesenc $TMP3, $CTR6, $CTR6
___
};
my $aes_lastround_dec = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $TMP3
vaesenclast $TMP3, $CTR1, $CTR1
vaesenclast $TMP3, $CTR2, $CTR2
vaesenclast $TMP3, $CTR3, $CTR3
vaesenclast $TMP3, $CTR4, $CTR4
vaesenclast $TMP3, $CTR5, $CTR5
vaesenclast $TMP3, $CTR6, $CTR6
___
};
my $schoolbook = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5
vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS
vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3
vpxor $TMP3, $TMP1, $TMP1
vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
___
};
if ($aes256) {
$code.=<<___;
.globl aes256gcmsiv_dec
.type aes256gcmsiv_dec,\@function,6
.align 16
aes256gcmsiv_dec:
___
} else {
$code.=<<___;
.globl aes128gcmsiv_dec
.type aes128gcmsiv_dec,\@function,6
.align 16
aes128gcmsiv_dec:
___
}
$code.=<<___;
.cfi_startproc
test \$~15, $LEN
jnz .L${labelPrefix}_dec_start
ret
.L${labelPrefix}_dec_start:
vzeroupper
vmovdqa ($POL), $T
movq $POL, $secureBuffer
leaq 32($secureBuffer), $secureBuffer
leaq 32($Htbl), $Htbl
# make CTRBLKs from given tag.
vmovdqu ($CT,$LEN), $CTR
vpor OR_MASK(%rip), $CTR, $CTR # CTR = [1]TAG[126...32][00..00]
andq \$~15, $LEN
# If less then 6 blocks, make singles
cmp \$96, $LEN
jb .L${labelPrefix}_dec_loop2
# Decrypt the first six blocks
sub \$96, $LEN
vmovdqa $CTR, $CTR1
vpaddd one(%rip), $CTR1, $CTR2
vpaddd two(%rip), $CTR1, $CTR3
vpaddd one(%rip), $CTR3, $CTR4
vpaddd two(%rip), $CTR3, $CTR5
vpaddd one(%rip), $CTR5, $CTR6
vpaddd two(%rip), $CTR5, $CTR
vpxor ($KS), $CTR1, $CTR1
vpxor ($KS), $CTR2, $CTR2
vpxor ($KS), $CTR3, $CTR3
vpxor ($KS), $CTR4, $CTR4
vpxor ($KS), $CTR5, $CTR5
vpxor ($KS), $CTR6, $CTR6
${\$aes_round_dec->(1)}
${\$aes_round_dec->(2)}
${\$aes_round_dec->(3)}
${\$aes_round_dec->(4)}
${\$aes_round_dec->(5)}
${\$aes_round_dec->(6)}
${\$aes_round_dec->(7)}
${\$aes_round_dec->(8)}
${\$aes_round_dec->(9)}
___
if ($aes256) {
$code.=<<___;
${\$aes_round_dec->(10)}
${\$aes_round_dec->(11)}
${\$aes_round_dec->(12)}
${\$aes_round_dec->(13)}
${\$aes_lastround_dec->(14)}
___
} else {
$code.=<<___;
${\$aes_lastround_dec->(10)}
___
}
$code.=<<___;
# XOR with CT
vpxor 0*16($CT), $CTR1, $CTR1
vpxor 1*16($CT), $CTR2, $CTR2
vpxor 2*16($CT), $CTR3, $CTR3
vpxor 3*16($CT), $CTR4, $CTR4
vpxor 4*16($CT), $CTR5, $CTR5
vpxor 5*16($CT), $CTR6, $CTR6
vmovdqu $CTR1, 0*16($PT)
vmovdqu $CTR2, 1*16($PT)
vmovdqu $CTR3, 2*16($PT)
vmovdqu $CTR4, 3*16($PT)
vmovdqu $CTR5, 4*16($PT)
vmovdqu $CTR6, 5*16($PT)
addq \$96, $CT
addq \$96, $PT
jmp .L${labelPrefix}_dec_loop1
# Decrypt 6 blocks each time while hashing previous 6 blocks
.align 64
.L${labelPrefix}_dec_loop1:
cmp \$96, $LEN
jb .L${labelPrefix}_dec_finish_96
sub \$96, $LEN
vmovdqa $CTR6, $TMP5
vmovdqa $CTR5, 1*16-32($secureBuffer)
vmovdqa $CTR4, 2*16-32($secureBuffer)
vmovdqa $CTR3, 3*16-32($secureBuffer)
vmovdqa $CTR2, 4*16-32($secureBuffer)
vmovdqa $CTR1, 5*16-32($secureBuffer)
vmovdqa $CTR, $CTR1
vpaddd one(%rip), $CTR1, $CTR2
vpaddd two(%rip), $CTR1, $CTR3
vpaddd one(%rip), $CTR3, $CTR4
vpaddd two(%rip), $CTR3, $CTR5
vpaddd one(%rip), $CTR5, $CTR6
vpaddd two(%rip), $CTR5, $CTR
vmovdqa ($KS), $TMP3
vpxor $TMP3, $CTR1, $CTR1
vpxor $TMP3, $CTR2, $CTR2
vpxor $TMP3, $CTR3, $CTR3
vpxor $TMP3, $CTR4, $CTR4
vpxor $TMP3, $CTR5, $CTR5
vpxor $TMP3, $CTR6, $CTR6
vmovdqu 0*16-32($Htbl), $TMP3
vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0
vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
${\$aes_round_dec->(1)}
${\$schoolbook->(1)}
${\$aes_round_dec->(2)}
${\$schoolbook->(2)}
${\$aes_round_dec->(3)}
${\$schoolbook->(3)}
${\$aes_round_dec->(4)}
${\$schoolbook->(4)}
${\$aes_round_dec->(5)}
${\$aes_round_dec->(6)}
${\$aes_round_dec->(7)}
vmovdqa 5*16-32($secureBuffer), $TMP5
vpxor $T, $TMP5, $TMP5
vmovdqu 5*16-32($Htbl), $TMP4
vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP1, $TMP1
vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
${\$aes_round_dec->(8)}
vpsrldq \$8, $TMP0, $TMP3
vpxor $TMP3, $TMP1, $TMP4
vpslldq \$8, $TMP0, $TMP3
vpxor $TMP3, $TMP2, $T
vmovdqa poly(%rip), $TMP2
${\$aes_round_dec->(9)}
___
if ($aes256) {
$code.=<<___;
${\$aes_round_dec->(10)}
${\$aes_round_dec->(11)}
${\$aes_round_dec->(12)}
${\$aes_round_dec->(13)}
vmovdqu 14*16($KS), $TMP5
___
} else {
$code.=<<___;
vmovdqu 10*16($KS), $TMP5
___
}
$code.=<<___;
vpalignr \$8, $T, $T, $TMP1
vpclmulqdq \$0x10, $TMP2, $T, $T
vpxor $T, $TMP1, $T
vpxor 0*16($CT), $TMP5, $TMP3
vaesenclast $TMP3, $CTR1, $CTR1
vpxor 1*16($CT), $TMP5, $TMP3
vaesenclast $TMP3, $CTR2, $CTR2
vpxor 2*16($CT), $TMP5, $TMP3
vaesenclast $TMP3, $CTR3, $CTR3
vpxor 3*16($CT), $TMP5, $TMP3
vaesenclast $TMP3, $CTR4, $CTR4
vpxor 4*16($CT), $TMP5, $TMP3
vaesenclast $TMP3, $CTR5, $CTR5
vpxor 5*16($CT), $TMP5, $TMP3
vaesenclast $TMP3, $CTR6, $CTR6
vpalignr \$8, $T, $T, $TMP1
vpclmulqdq \$0x10, $TMP2, $T, $T
vpxor $T, $TMP1, $T
vmovdqu $CTR1, 0*16($PT)
vmovdqu $CTR2, 1*16($PT)
vmovdqu $CTR3, 2*16($PT)
vmovdqu $CTR4, 3*16($PT)
vmovdqu $CTR5, 4*16($PT)
vmovdqu $CTR6, 5*16($PT)
vpxor $TMP4, $T, $T
lea 96($CT), $CT
lea 96($PT), $PT
jmp .L${labelPrefix}_dec_loop1
.L${labelPrefix}_dec_finish_96:
vmovdqa $CTR6, $TMP5
vmovdqa $CTR5, 1*16-32($secureBuffer)
vmovdqa $CTR4, 2*16-32($secureBuffer)
vmovdqa $CTR3, 3*16-32($secureBuffer)
vmovdqa $CTR2, 4*16-32($secureBuffer)
vmovdqa $CTR1, 5*16-32($secureBuffer)
vmovdqu 0*16-32($Htbl), $TMP3
vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0
vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
${\$schoolbook->(1)}
${\$schoolbook->(2)}
${\$schoolbook->(3)}
${\$schoolbook->(4)}
vmovdqu 5*16-32($secureBuffer), $TMP5
vpxor $T, $TMP5, $TMP5
vmovdqu 5*16-32($Htbl), $TMP4
vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP1, $TMP1
vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpsrldq \$8, $TMP0, $TMP3
vpxor $TMP3, $TMP1, $TMP4
vpslldq \$8, $TMP0, $TMP3
vpxor $TMP3, $TMP2, $T
vmovdqa poly(%rip), $TMP2
vpalignr \$8, $T, $T, $TMP1
vpclmulqdq \$0x10, $TMP2, $T, $T
vpxor $T, $TMP1, $T
vpalignr \$8, $T, $T, $TMP1
vpclmulqdq \$0x10, $TMP2, $T, $T
vpxor $T, $TMP1, $T
vpxor $TMP4, $T, $T
.L${labelPrefix}_dec_loop2:
# Here we encrypt any remaining whole block
# if there are no whole blocks
cmp \$16, $LEN
jb .L${labelPrefix}_dec_out
sub \$16, $LEN
vmovdqa $CTR, $TMP1
vpaddd one(%rip), $CTR, $CTR
vpxor 0*16($KS), $TMP1, $TMP1
vaesenc 1*16($KS), $TMP1, $TMP1
vaesenc 2*16($KS), $TMP1, $TMP1
vaesenc 3*16($KS), $TMP1, $TMP1
vaesenc 4*16($KS), $TMP1, $TMP1
vaesenc 5*16($KS), $TMP1, $TMP1
vaesenc 6*16($KS), $TMP1, $TMP1
vaesenc 7*16($KS), $TMP1, $TMP1
vaesenc 8*16($KS), $TMP1, $TMP1
vaesenc 9*16($KS), $TMP1, $TMP1
___
if ($aes256) {
$code.=<<___;
vaesenc 10*16($KS), $TMP1, $TMP1
vaesenc 11*16($KS), $TMP1, $TMP1
vaesenc 12*16($KS), $TMP1, $TMP1
vaesenc 13*16($KS), $TMP1, $TMP1
vaesenclast 14*16($KS), $TMP1, $TMP1
___
} else {
$code.=<<___;
vaesenclast 10*16($KS), $TMP1, $TMP1
___
}
$code.=<<___;
vpxor ($CT), $TMP1, $TMP1
vmovdqu $TMP1, ($PT)
addq \$16, $CT
addq \$16, $PT
vpxor $TMP1, $T, $T
vmovdqa -32($Htbl), $TMP0
call GFMUL
jmp .L${labelPrefix}_dec_loop2
.L${labelPrefix}_dec_out:
vmovdqu $T, ($POL)
ret
.cfi_endproc
___
if ($aes256) {
$code.=<<___;
.size aes256gcmsiv_dec, .-aes256gcmsiv_dec
___
} else {
$code.=<<___;
.size aes128gcmsiv_dec, .-aes128gcmsiv_dec
___
}
}
# Instantiate the shared decrypt generator once for AES-128. The visible tail
# of aesgcmsiv_dec switches on $aes256 (10 vs. 14 rounds, and the
# aes128gcmsiv_dec / aes256gcmsiv_dec symbol names); presumably this argument
# is what populates $aes256 -- TODO confirm against the sub's parameter list.
aesgcmsiv_dec(0); # emit 128-bit version
# Appends the aes128gcmsiv_ecb_enc_block routine to $code and emits it.
#
# The generated function encrypts a single 16-byte block with an already
# expanded AES-128 key schedule: whitening with round key 0, nine vaesenc
# rounds (keys 1..9) and a closing vaesenclast with key 10.
#
#   parameter 1: %rdi  pointer to the input block  (read with vmovdqa)
#   parameter 2: %rsi  pointer to the output block (written with vmovdqa)
#   parameter 3: %rdx  pointer to the key schedule
sub aes128gcmsiv_ecb_enc_block {
  my $block = "%xmm1";
  my $sched = "%rdx";
  my $fn = "aes128gcmsiv_ecb_enc_block";

  # Rounds 1..9 differ only in the key-schedule offset, so build them in one go.
  my $inner_rounds = join "",
    map { "vaesenc $_*16($sched), $block, $block\n" } 1 .. 9;

  $code .= <<___;
.globl $fn
.type $fn,\@function,3
.align 16
${fn}:
.cfi_startproc
vmovdqa (%rdi), $block
vpxor ($sched), $block, $block
${inner_rounds}vaesenclast 10*16($sched), $block, $block # STATE_1 == IV
vmovdqa $block, (%rsi)
ret
.cfi_endproc
.size $fn,.-$fn
___
}
aes128gcmsiv_ecb_enc_block();
# Appends the aes256gcmsiv_aes_ks_enc_x1 routine to $code and emits it.
#
# The generated function expands a 32-byte AES-256 key into 15 round keys
# (stored at 0, 16, ..., 224 bytes from KS) and, in the same pass, pushes one
# block through every round key as soon as it has been produced.
#
#   parameter 1: %rdi  pointer to PT1 (input block, read with vmovdqa)
#   parameter 2: %rsi  pointer to CT1 (output block, written with vmovdqa)
#   parameter 3: %rdx  pointer to KS  (key schedule, written with vmovdqu)
#   parameter 4: %rcx  pointer to the initial key (read with vmovdqa)
sub aes256gcmsiv_aes_ks_enc_x1 {
  my $KS = "%rdx";
  my $KEYp = "%rcx";
  my $CON_MASK = "%xmm0";
  my $MASK_256 = "%xmm15";
  my $KEY_1 = "%xmm1";
  my $KEY_2 = "%xmm3";
  my $BLOCK1 = "%xmm8";
  my $AUX_REG = "%xmm14";
  my $PT = "%rdi";
  my $CT = "%rsi";
  my $SUBST = "%xmm2";   # scratch: output of the vaesenclast substitution step
  my $SHIFT = "%xmm4";   # scratch: running byte-shifted copy of the key

  # Turns a list of instructions into newline-terminated assembly text.
  my $emit = sub {
    return join "", map { "$_\n" } @_;
  };

  # Instruction list that folds $key into itself: XOR with copies of itself
  # shifted left by 4, 8 and 12 bytes, then XOR with $SUBST.
  my $fold_key = sub {
    my ($key) = @_;
    return (
      "vpslldq \$4, $key, $SHIFT",
      "vpxor $SHIFT, $key, $key",
      "vpslldq \$4, $SHIFT, $SHIFT",
      "vpxor $SHIFT, $key, $key",
      "vpslldq \$4, $SHIFT, $SHIFT",
      "vpxor $SHIFT, $key, $key",
      "vpxor $SUBST, $key, $key",
    );
  };

  # Produces round keys $i (into $KEY_1) and $j (into $KEY_2), applies each to
  # $BLOCK1 with vaesenc and stores them at 16*$i / 16*$j in the schedule.
  # $CON_MASK is doubled (vpslld by 1) for the next call.
  my $round_double = sub {
    my ($i, $j) = @_;
    return $emit->(
      "vpshufb $MASK_256, $KEY_2, $SUBST",
      "vaesenclast $CON_MASK, $SUBST, $SUBST",
      "vpslld \$1, $CON_MASK, $CON_MASK",
      $fold_key->($KEY_1),
      "vaesenc $KEY_1, $BLOCK1, $BLOCK1",
      "vmovdqu $KEY_1, @{[ 16 * $i ]}($KS)",
      "vpshufd \$0xff, $KEY_1, $SUBST",
      "vaesenclast $AUX_REG, $SUBST, $SUBST",
      $fold_key->($KEY_2),
      "vaesenc $KEY_2, $BLOCK1, $BLOCK1",
      "vmovdqu $KEY_2, @{[ 16 * $j ]}($KS)",
    );
  };

  # Produces the final round key $i (into $KEY_1), finishes the block with
  # vaesenclast and stores the key at 16*$i in the schedule.
  my $round_last = sub {
    my ($i) = @_;
    return $emit->(
      "vpshufb $MASK_256, $KEY_2, $SUBST",
      "vaesenclast $CON_MASK, $SUBST, $SUBST",
      $fold_key->($KEY_1),
      "vaesenclast $KEY_1, $BLOCK1, $BLOCK1",
      "vmovdqu $KEY_1, @{[ 16 * $i ]}($KS)",
    );
  };

  # Round keys 0 and 1 are the user key itself; $AUX_REG is zeroed before the
  # expansion steps that use it as the vaesenclast key operand.
  $code.=<<___;
.globl aes256gcmsiv_aes_ks_enc_x1
.type aes256gcmsiv_aes_ks_enc_x1,\@function,4
.align 16
aes256gcmsiv_aes_ks_enc_x1:
.cfi_startproc
vmovdqa con1(%rip), $CON_MASK # CON_MASK = 1,1,1,1
vmovdqa mask(%rip), $MASK_256 # MASK_256
vmovdqa ($PT), $BLOCK1
vmovdqa ($KEYp), $KEY_1 # KEY_1 || KEY_2 [0..7] = user key
vmovdqa 16($KEYp), $KEY_2
vpxor $KEY_1, $BLOCK1, $BLOCK1
vaesenc $KEY_2, $BLOCK1, $BLOCK1
vmovdqu $KEY_1, ($KS) # First round key
vmovdqu $KEY_2, 16($KS)
vpxor $AUX_REG, $AUX_REG, $AUX_REG
${\$round_double->(2, 3)}
${\$round_double->(4, 5)}
${\$round_double->(6, 7)}
${\$round_double->(8, 9)}
${\$round_double->(10, 11)}
${\$round_double->(12, 13)}
${\$round_last->(14)}
vmovdqa $BLOCK1, ($CT)
ret
.cfi_endproc
.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
___
}
aes256gcmsiv_aes_ks_enc_x1();
# Appends the aes256gcmsiv_ecb_enc_block routine to $code and emits it.
#
# The generated function encrypts a single 16-byte block with an already
# expanded AES-256 key schedule: whitening with round key 0, thirteen vaesenc
# rounds (keys 1..13) and a closing vaesenclast with key 14.
#
#   parameter 1: PT  %rdi  (pointer to 128 bit, read with vmovdqa)
#   parameter 2: CT  %rsi  (pointer to 128 bit, written with vmovdqa)
#   parameter 3: ks  %rdx  (pointer to ks)
sub aes256gcmsiv_ecb_enc_block {
  my $STATE_1 = "%xmm1";
  my $PT = "%rdi";
  my $CT = "%rsi";
  my $KSp = "%rdx";

  # $PT / $CT are used for the load and store below instead of repeating the
  # raw register names, so the declarations above stay the single source of
  # truth for the register assignment.
  $code.=<<___;
.globl aes256gcmsiv_ecb_enc_block
.type aes256gcmsiv_ecb_enc_block,\@function,3
.align 16
aes256gcmsiv_ecb_enc_block:
.cfi_startproc
vmovdqa ($PT), $STATE_1
vpxor ($KSp), $STATE_1, $STATE_1
vaesenc 1*16($KSp), $STATE_1, $STATE_1
vaesenc 2*16($KSp), $STATE_1, $STATE_1
vaesenc 3*16($KSp), $STATE_1, $STATE_1
vaesenc 4*16($KSp), $STATE_1, $STATE_1
vaesenc 5*16($KSp), $STATE_1, $STATE_1
vaesenc 6*16($KSp), $STATE_1, $STATE_1
vaesenc 7*16($KSp), $STATE_1, $STATE_1
vaesenc 8*16($KSp), $STATE_1, $STATE_1
vaesenc 9*16($KSp), $STATE_1, $STATE_1
vaesenc 10*16($KSp), $STATE_1, $STATE_1
vaesenc 11*16($KSp), $STATE_1, $STATE_1
vaesenc 12*16($KSp), $STATE_1, $STATE_1
vaesenc 13*16($KSp), $STATE_1, $STATE_1
vaesenclast 14*16($KSp), $STATE_1, $STATE_1 # $STATE_1 == IV
vmovdqa $STATE_1, ($CT)
ret
.cfi_endproc
.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
___
}
aes256gcmsiv_ecb_enc_block();
# Appends the aes256gcmsiv_enc_msg_x4 routine to $code and emits it.
#
# void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
#                              unsigned char* TAG, unsigned char* KS,
#                              size_t byte_len);
#   parameter 1: %rdi  #PT
#   parameter 2: %rsi  #CT
#   parameter 3: %rdx  #TAG  [127 126 ... 0]  IV=[127...32]
#   parameter 4: %rcx  #KS
#   parameter 5: %r8   #LEN  MSG_length in bytes
#
# Outline of the generated code:
#   * returns immediately when LEN is zero;
#   * converts LEN to a block count, rounding a trailing partial block up to
#     a whole one (so a full 16 bytes are read and written for it --
#     presumably callers size their buffers accordingly; TODO confirm);
#   * %r10 keeps the block count modulo 4, LEN becomes the number of
#     four-block groups;
#   * the initial counter is TAG OR'ed with OR_MASK; CTR1..CTR4 are that value
#     plus 0..3 (vpaddd, i.e. 32-bit lane additions);
#   * loop1 encrypts four counters per iteration with the 14-round schedule,
#     XORs them with PT and stores to CT, advancing each counter by four;
#   * loop2 handles the remaining %r10 blocks one at a time using CTR1.
sub aes256gcmsiv_enc_msg_x4 {
  my $CTR1 = "%xmm0";
  my $CTR2 = "%xmm1";
  my $CTR3 = "%xmm2";
  my $CTR4 = "%xmm3";
  my $ADDER = "%xmm4";
  my $STATE1 = "%xmm5";
  my $STATE2 = "%xmm6";
  my $STATE3 = "%xmm7";
  my $STATE4 = "%xmm8";
  my $TMP = "%xmm12";
  my $IV = "%xmm15";
  my $PT = "%rdi";
  my $CT = "%rsi";
  my $TAG = "%rdx";
  my $KS = "%rcx";
  my $LEN = "%r8";

  # One vaesenc round over all four states with round key number $i.
  my $aes_round = sub {
    my ($i) = @_;
    my $offset = $i * 16;
    return <<___;
vmovdqu ${offset}($KS), $TMP
vaesenc $TMP, $STATE1, $STATE1
vaesenc $TMP, $STATE2, $STATE2
vaesenc $TMP, $STATE3, $STATE3
vaesenc $TMP, $STATE4, $STATE4
___
  };

  # The closing vaesenclast round over all four states with round key $i.
  my $aes_lastround = sub {
    my ($i) = @_;
    my $offset = $i * 16;
    return <<___;
vmovdqu ${offset}($KS), $TMP
vaesenclast $TMP, $STATE1, $STATE1
vaesenclast $TMP, $STATE2, $STATE2
vaesenclast $TMP, $STATE3, $STATE3
vaesenclast $TMP, $STATE4, $STATE4
___
  };

  # The counter increments are deliberately interleaved with the first AES
  # rounds; keep the instruction order below exactly as written.
  $code.=<<___;
.globl aes256gcmsiv_enc_msg_x4
.type aes256gcmsiv_enc_msg_x4,\@function,5
.align 16
aes256gcmsiv_enc_msg_x4:
.cfi_startproc
test $LEN, $LEN
jnz .L256_enc_msg_x4_start
ret
.L256_enc_msg_x4_start:
movq $LEN, %r10
shrq \$4, $LEN # LEN = num of blocks
shlq \$60, %r10
jz .L256_enc_msg_x4_start2
addq \$1, $LEN
.L256_enc_msg_x4_start2:
movq $LEN, %r10
shlq \$62, %r10
shrq \$62, %r10
# make IV from TAG
vmovdqa ($TAG), $IV
vpor OR_MASK(%rip), $IV, $IV # IV = [1]TAG[126...32][00..00]
vmovdqa four(%rip), $ADDER # Register to increment counters
vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
shrq \$2, $LEN
je .L256_enc_msg_x4_check_remainder
subq \$64, $CT
subq \$64, $PT
.L256_enc_msg_x4_loop1:
addq \$64, $CT
addq \$64, $PT
vmovdqa $CTR1, $STATE1
vmovdqa $CTR2, $STATE2
vmovdqa $CTR3, $STATE3
vmovdqa $CTR4, $STATE4
vpxor ($KS), $STATE1, $STATE1
vpxor ($KS), $STATE2, $STATE2
vpxor ($KS), $STATE3, $STATE3
vpxor ($KS), $STATE4, $STATE4
${\$aes_round->(1)}
vpaddd $ADDER, $CTR1, $CTR1
${\$aes_round->(2)}
vpaddd $ADDER, $CTR2, $CTR2
${\$aes_round->(3)}
vpaddd $ADDER, $CTR3, $CTR3
${\$aes_round->(4)}
vpaddd $ADDER, $CTR4, $CTR4
${\$aes_round->(5)}
${\$aes_round->(6)}
${\$aes_round->(7)}
${\$aes_round->(8)}
${\$aes_round->(9)}
${\$aes_round->(10)}
${\$aes_round->(11)}
${\$aes_round->(12)}
${\$aes_round->(13)}
${\$aes_lastround->(14)}
# XOR with Plaintext
vpxor 0*16($PT), $STATE1, $STATE1
vpxor 1*16($PT), $STATE2, $STATE2
vpxor 2*16($PT), $STATE3, $STATE3
vpxor 3*16($PT), $STATE4, $STATE4
subq \$1, $LEN
vmovdqu $STATE1, 0*16($CT)
vmovdqu $STATE2, 1*16($CT)
vmovdqu $STATE3, 2*16($CT)
vmovdqu $STATE4, 3*16($CT)
jne .L256_enc_msg_x4_loop1
addq \$64, $CT
addq \$64, $PT
.L256_enc_msg_x4_check_remainder:
cmpq \$0, %r10
je .L256_enc_msg_x4_out
.L256_enc_msg_x4_loop2:
# encrypt each block separately
# CTR1 is the highest counter (even if no LOOP done)
vmovdqa $CTR1, $STATE1
vpaddd one(%rip), $CTR1, $CTR1 # inc counter
vpxor ($KS), $STATE1, $STATE1
vaesenc 16($KS), $STATE1, $STATE1
vaesenc 32($KS), $STATE1, $STATE1
vaesenc 48($KS), $STATE1, $STATE1
vaesenc 64($KS), $STATE1, $STATE1
vaesenc 80($KS), $STATE1, $STATE1
vaesenc 96($KS), $STATE1, $STATE1
vaesenc 112($KS), $STATE1, $STATE1
vaesenc 128($KS), $STATE1, $STATE1
vaesenc 144($KS), $STATE1, $STATE1
vaesenc 160($KS), $STATE1, $STATE1
vaesenc 176($KS), $STATE1, $STATE1
vaesenc 192($KS), $STATE1, $STATE1
vaesenc 208($KS), $STATE1, $STATE1
vaesenclast 224($KS), $STATE1, $STATE1
# XOR with Plaintext
vpxor ($PT), $STATE1, $STATE1
vmovdqu $STATE1, ($CT)
addq \$16, $PT
addq \$16, $CT
subq \$1, %r10
jne .L256_enc_msg_x4_loop2
.L256_enc_msg_x4_out:
ret
.cfi_endproc
.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
___
}
aes256gcmsiv_enc_msg_x4();
# Appends the aes256gcmsiv_enc_msg_x8 routine to $code and emits it.
#
# void ENC_MSG_x8(unsigned char* PT,
#                 unsigned char* CT,
#                 unsigned char* TAG,
#                 unsigned char* KS,
#                 size_t byte_len);
#   parameter 1: %rdi  #PT
#   parameter 2: %rsi  #CT
#   parameter 3: %rdx  #TAG  [127 126 ... 0]  IV=[127...32]
#   parameter 4: %rcx  #KS
#   parameter 5: %r8   #LEN  MSG_length in bytes
#
# Outline of the generated code:
#   * returns immediately when LEN is zero;
#   * converts LEN to a block count, rounding a trailing partial block up to
#     a whole one; %r10 keeps that count modulo 8 and LEN becomes the number
#     of eight-block groups;
#   * the initial counter is TAG OR'ed with OR_MASK; counters 1..7 live in
#     registers, while the eighth is kept in memory at (%r11) because
#     %xmm0-%xmm15 are all taken by states, counters and the round key;
#   * %r11 is (%rsp - 16) rounded down to a multiple of 64; %rsp itself is
#     never adjusted.  NOTE(review): this relies on the memory just below
#     %rsp being safe to use (e.g. the SysV red zone) -- confirm for every
#     flavour this file is built for;
#   * loop1 encrypts eight counters per iteration, interleaving the counter
#     updates with the first AES rounds; CTR7 is rebuilt from the updated
#     eighth counter minus one;
#   * loop2 handles the remaining %r10 blocks one at a time using CTR1.
#
# The sub takes no arguments and is declared without a prototype, like the
# other generator subs in this file.
sub aes256gcmsiv_enc_msg_x8 {
  my $STATE1 = "%xmm1";
  my $STATE2 = "%xmm2";
  my $STATE3 = "%xmm3";
  my $STATE4 = "%xmm4";
  my $STATE5 = "%xmm5";
  my $STATE6 = "%xmm6";
  my $STATE7 = "%xmm7";
  my $STATE8 = "%xmm8";
  my $CTR1 = "%xmm0";
  my $CTR2 = "%xmm9";
  my $CTR3 = "%xmm10";
  my $CTR4 = "%xmm11";
  my $CTR5 = "%xmm12";
  my $CTR6 = "%xmm13";
  my $CTR7 = "%xmm14";
  my $TMP1 = "%xmm1";   # same register as $STATE1; only used before the loops
  my $KS = "%rcx";
  my $LEN = "%r8";
  my $PT = "%rdi";
  my $CT = "%rsi";
  my $TAG = "%rdx";
  my $SCHED = "%xmm15";

  # One vaesenc round over all eight states with round key number $i.
  my $aes_round8 = sub {
    my ($i) = @_;
    my $offset = $i * 16;
    return <<___;
vmovdqu ${offset}($KS), $SCHED
vaesenc $SCHED, $STATE1, $STATE1
vaesenc $SCHED, $STATE2, $STATE2
vaesenc $SCHED, $STATE3, $STATE3
vaesenc $SCHED, $STATE4, $STATE4
vaesenc $SCHED, $STATE5, $STATE5
vaesenc $SCHED, $STATE6, $STATE6
vaesenc $SCHED, $STATE7, $STATE7
vaesenc $SCHED, $STATE8, $STATE8
___
  };

  # The closing vaesenclast round over all eight states with round key $i.
  my $aes_lastround8 = sub {
    my ($i) = @_;
    my $offset = $i * 16;
    return <<___;
vmovdqu ${offset}($KS), $SCHED
vaesenclast $SCHED, $STATE1, $STATE1
vaesenclast $SCHED, $STATE2, $STATE2
vaesenclast $SCHED, $STATE3, $STATE3
vaesenclast $SCHED, $STATE4, $STATE4
vaesenclast $SCHED, $STATE5, $STATE5
vaesenclast $SCHED, $STATE6, $STATE6
vaesenclast $SCHED, $STATE7, $STATE7
vaesenclast $SCHED, $STATE8, $STATE8
___
  };

  # Instruction order below is significant (counter updates are scheduled
  # between AES rounds); keep it exactly as written.
  $code.=<<___;
.globl aes256gcmsiv_enc_msg_x8
.type aes256gcmsiv_enc_msg_x8,\@function,5
.align 16
aes256gcmsiv_enc_msg_x8:
.cfi_startproc
test $LEN, $LEN
jnz .L256_enc_msg_x8_start
ret
.L256_enc_msg_x8_start:
# Place in stack
movq %rsp, %r11
subq \$16, %r11
andq \$-64, %r11
movq $LEN, %r10
shrq \$4, $LEN # LEN = num of blocks
shlq \$60, %r10
jz .L256_enc_msg_x8_start2
addq \$1, $LEN
.L256_enc_msg_x8_start2:
movq $LEN, %r10
shlq \$61, %r10
shrq \$61, %r10
# Make IV from TAG
vmovdqa ($TAG), $TMP1
vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00]
# store counter8 on the stack
vpaddd seven(%rip), $TMP1, $CTR1
vmovdqa $CTR1, (%r11) # CTR8 = TAG[127...32][00..07]
vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]
shrq \$3, $LEN
jz .L256_enc_msg_x8_check_remainder
subq \$128, $CT
subq \$128, $PT
.L256_enc_msg_x8_loop1:
addq \$128, $CT
addq \$128, $PT
vmovdqa $CTR1, $STATE1
vmovdqa $CTR2, $STATE2
vmovdqa $CTR3, $STATE3
vmovdqa $CTR4, $STATE4
vmovdqa $CTR5, $STATE5
vmovdqa $CTR6, $STATE6
vmovdqa $CTR7, $STATE7
# move from stack
vmovdqa (%r11), $STATE8
vpxor ($KS), $STATE1, $STATE1
vpxor ($KS), $STATE2, $STATE2
vpxor ($KS), $STATE3, $STATE3
vpxor ($KS), $STATE4, $STATE4
vpxor ($KS), $STATE5, $STATE5
vpxor ($KS), $STATE6, $STATE6
vpxor ($KS), $STATE7, $STATE7
vpxor ($KS), $STATE8, $STATE8
${\$aes_round8->(1)}
vmovdqa (%r11), $CTR7 # deal with CTR8
vpaddd eight(%rip), $CTR7, $CTR7
vmovdqa $CTR7, (%r11)
${\$aes_round8->(2)}
vpsubd one(%rip), $CTR7, $CTR7
${\$aes_round8->(3)}
vpaddd eight(%rip), $CTR1, $CTR1
${\$aes_round8->(4)}
vpaddd eight(%rip), $CTR2, $CTR2
${\$aes_round8->(5)}
vpaddd eight(%rip), $CTR3, $CTR3
${\$aes_round8->(6)}
vpaddd eight(%rip), $CTR4, $CTR4
${\$aes_round8->(7)}
vpaddd eight(%rip), $CTR5, $CTR5
${\$aes_round8->(8)}
vpaddd eight(%rip), $CTR6, $CTR6
${\$aes_round8->(9)}
${\$aes_round8->(10)}
${\$aes_round8->(11)}
${\$aes_round8->(12)}
${\$aes_round8->(13)}
${\$aes_lastround8->(14)}
# XOR with Plaintext
vpxor 0*16($PT), $STATE1, $STATE1
vpxor 1*16($PT), $STATE2, $STATE2
vpxor 2*16($PT), $STATE3, $STATE3
vpxor 3*16($PT), $STATE4, $STATE4
vpxor 4*16($PT), $STATE5, $STATE5
vpxor 5*16($PT), $STATE6, $STATE6
vpxor 6*16($PT), $STATE7, $STATE7
vpxor 7*16($PT), $STATE8, $STATE8
subq \$1, $LEN
vmovdqu $STATE1, 0*16($CT)
vmovdqu $STATE2, 1*16($CT)
vmovdqu $STATE3, 2*16($CT)
vmovdqu $STATE4, 3*16($CT)
vmovdqu $STATE5, 4*16($CT)
vmovdqu $STATE6, 5*16($CT)
vmovdqu $STATE7, 6*16($CT)
vmovdqu $STATE8, 7*16($CT)
jne .L256_enc_msg_x8_loop1
addq \$128, $CT
addq \$128, $PT
.L256_enc_msg_x8_check_remainder:
cmpq \$0, %r10
je .L256_enc_msg_x8_out
.L256_enc_msg_x8_loop2:
# encrypt each block separately
# CTR1 is the highest counter (even if no LOOP done)
vmovdqa $CTR1, $STATE1
vpaddd one(%rip), $CTR1, $CTR1
vpxor ($KS), $STATE1, $STATE1
vaesenc 16($KS), $STATE1, $STATE1
vaesenc 32($KS), $STATE1, $STATE1
vaesenc 48($KS), $STATE1, $STATE1
vaesenc 64($KS), $STATE1, $STATE1
vaesenc 80($KS), $STATE1, $STATE1
vaesenc 96($KS), $STATE1, $STATE1
vaesenc 112($KS), $STATE1, $STATE1
vaesenc 128($KS), $STATE1, $STATE1
vaesenc 144($KS), $STATE1, $STATE1
vaesenc 160($KS), $STATE1, $STATE1
vaesenc 176($KS), $STATE1, $STATE1
vaesenc 192($KS), $STATE1, $STATE1
vaesenc 208($KS), $STATE1, $STATE1
vaesenclast 224($KS), $STATE1, $STATE1
# XOR with Plaintext
vpxor ($PT), $STATE1, $STATE1
vmovdqu $STATE1, ($CT)
addq \$16, $PT
addq \$16, $CT
subq \$1, %r10
jnz .L256_enc_msg_x8_loop2
.L256_enc_msg_x8_out:
ret
.cfi_endproc
.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
___
}
aes256gcmsiv_enc_msg_x8();
# Second instantiation of the shared decrypt generator, counterpart of the
# earlier aesgcmsiv_dec(0) call: emit the 256-bit version (the generator's
# $aes256 branches add rounds 10..13 and name the symbol aes256gcmsiv_dec;
# presumably this argument is what sets $aes256 -- TODO confirm).
aesgcmsiv_dec(1);
# Appends the aes256gcmsiv_kdf routine to $code and emits it.
#
# void aes256gcmsiv_kdf(const uint8_t nonce[16],
#                       uint8_t *out_key_material,
#                       const uint8_t *key_schedule);
#
# Outline of the generated code:
#   * the nonce block is rearranged with vpshufd 0x90 and masked with
#     and_mask so that dword 0 is cleared and dwords 1..3 hold the first
#     three dwords of the input;
#   * five further blocks are derived by repeatedly adding `one`, giving six
#     blocks whose dword 0 runs 0..5;
#   * all six blocks are encrypted with the 14-round (AES-256) key schedule
#     and written to the output as 6 * 16 = 96 bytes.
# All three pointers are accessed with vmovdqa (aligned moves).
sub aes256gcmsiv_kdf {
  my $ONE = "%xmm8";
  my $BLOCK1 = "%xmm4";
  my $BLOCK2 = "%xmm6";
  my $BLOCK3 = "%xmm7";
  my $BLOCK4 = "%xmm11";
  my $BLOCK5 = "%xmm12";
  my $BLOCK6 = "%xmm13";

  # One vaesenc round over all six blocks: loads round key $i into register
  # $j and applies it to every block.
  my $enc_roundx6 = sub {
    my ($i, $j) = @_;
    my $offset = $i * 16;
    return <<___;
vmovdqa ${offset}(%rdx), $j
vaesenc $j, $BLOCK1, $BLOCK1
vaesenc $j, $BLOCK2, $BLOCK2
vaesenc $j, $BLOCK3, $BLOCK3
vaesenc $j, $BLOCK4, $BLOCK4
vaesenc $j, $BLOCK5, $BLOCK5
vaesenc $j, $BLOCK6, $BLOCK6
___
  };

  # The closing vaesenclast round over all six blocks with round key $i,
  # loaded into register $j.
  my $enc_roundlastx6 = sub {
    my ($i, $j) = @_;
    my $offset = $i * 16;
    return <<___;
vmovdqa ${offset}(%rdx), $j
vaesenclast $j, $BLOCK1, $BLOCK1
vaesenclast $j, $BLOCK2, $BLOCK2
vaesenclast $j, $BLOCK3, $BLOCK3
vaesenclast $j, $BLOCK4, $BLOCK4
vaesenclast $j, $BLOCK5, $BLOCK5
vaesenclast $j, $BLOCK6, $BLOCK6
___
  };

  # Round keys are loaded alternately into %xmm1 and %xmm2. $BLOCK4 briefly
  # holds and_mask before it is overwritten with the fourth counter block.
  $code.=<<___;
.globl aes256gcmsiv_kdf
.type aes256gcmsiv_kdf,\@function,3
.align 16
aes256gcmsiv_kdf:
.cfi_startproc
# parameter 1: %rdi Pointer to NONCE
# parameter 2: %rsi Pointer to CT
# parameter 3: %rdx Pointer to keys
vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
vmovdqa 0*16(%rdi), $BLOCK1
vmovdqa and_mask(%rip), $BLOCK4
vmovdqa one(%rip), $ONE
vpshufd \$0x90, $BLOCK1, $BLOCK1
vpand $BLOCK4, $BLOCK1, $BLOCK1
vpaddd $ONE, $BLOCK1, $BLOCK2
vpaddd $ONE, $BLOCK2, $BLOCK3
vpaddd $ONE, $BLOCK3, $BLOCK4
vpaddd $ONE, $BLOCK4, $BLOCK5
vpaddd $ONE, $BLOCK5, $BLOCK6
vpxor %xmm1, $BLOCK1, $BLOCK1
vpxor %xmm1, $BLOCK2, $BLOCK2
vpxor %xmm1, $BLOCK3, $BLOCK3
vpxor %xmm1, $BLOCK4, $BLOCK4
vpxor %xmm1, $BLOCK5, $BLOCK5
vpxor %xmm1, $BLOCK6, $BLOCK6
${\$enc_roundx6->(1, "%xmm1")}
${\$enc_roundx6->(2, "%xmm2")}
${\$enc_roundx6->(3, "%xmm1")}
${\$enc_roundx6->(4, "%xmm2")}
${\$enc_roundx6->(5, "%xmm1")}
${\$enc_roundx6->(6, "%xmm2")}
${\$enc_roundx6->(7, "%xmm1")}
${\$enc_roundx6->(8, "%xmm2")}
${\$enc_roundx6->(9, "%xmm1")}
${\$enc_roundx6->(10, "%xmm2")}
${\$enc_roundx6->(11, "%xmm1")}
${\$enc_roundx6->(12, "%xmm2")}
${\$enc_roundx6->(13, "%xmm1")}
${\$enc_roundlastx6->(14, "%xmm2")}
vmovdqa $BLOCK1, 0*16(%rsi)
vmovdqa $BLOCK2, 1*16(%rsi)
vmovdqa $BLOCK3, 2*16(%rsi)
vmovdqa $BLOCK4, 3*16(%rsi)
vmovdqa $BLOCK5, 4*16(%rsi)
vmovdqa $BLOCK6, 5*16(%rsi)
ret
.cfi_endproc
.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
___
}
aes256gcmsiv_kdf();
# Hand the accumulated assembly to the x86_64-xlate.pl child process that
# STDOUT was redirected to (via the OUT pipe) at the top of this file.
print $code;
# Closing a piped handle waits for the child; it returns false when the write
# side failed or the translator exited with a non-zero status, so check it
# instead of letting a broken translation pass silently.
close STDOUT or die "error closing STDOUT: $! (child status $?)";