Remove if'd-out OCB-AES assembly
BoringSSL never shipped the OCB-AES assembly, but took two different
strategies in disabling it for x86 versus x86_64. For x86, the
implementation was deleted, but for x86_64 it was wrapped in `if(0)`.
Since we're no longer as concerned about keeping the assembly from
diverging from upstream, make the two architectures consistent by
deleting the OCB-AES functions from x86_64 as well.
Change-Id: I5233134e3e131fed56f365ed6f43f30c39dd2e33
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/56989
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 2abc8d0..320760a 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -2764,955 +2764,6 @@
.cfi_endproc
.size ${PREFIX}_xts_decrypt,.-${PREFIX}_xts_decrypt
___
-}
-
-######################################################################
-# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
-# const AES_KEY *key, unsigned int start_block_num,
-# unsigned char offset_i[16], const unsigned char L_[][16],
-# unsigned char checksum[16]);
-#
-if (0) { # Omit these functions in BoringSSL
-my @offset=map("%xmm$_",(10..15));
-my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
-my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
-my ($L_p,$checksum_p) = ("%rbx","%rbp");
-my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
-my $seventh_arg = $win64 ? 56 : 8;
-my $blocks = $len;
-
-$code.=<<___;
-.globl ${PREFIX}_ocb_encrypt
-.type ${PREFIX}_ocb_encrypt,\@function,6
-.align 32
-${PREFIX}_ocb_encrypt:
-.cfi_startproc
- lea (%rsp),%rax
- push %rbx
-.cfi_push %rbx
- push %rbp
-.cfi_push %rbp
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
-___
-$code.=<<___ if ($win64);
- lea -0xa0(%rsp),%rsp
- movaps %xmm6,0x00(%rsp) # offload everything
- movaps %xmm7,0x10(%rsp)
- movaps %xmm8,0x20(%rsp)
- movaps %xmm9,0x30(%rsp)
- movaps %xmm10,0x40(%rsp)
- movaps %xmm11,0x50(%rsp)
- movaps %xmm12,0x60(%rsp)
- movaps %xmm13,0x70(%rsp)
- movaps %xmm14,0x80(%rsp)
- movaps %xmm15,0x90(%rsp)
-.Locb_enc_body:
-___
-$code.=<<___;
- mov $seventh_arg(%rax),$L_p # 7th argument
- mov $seventh_arg+8(%rax),$checksum_p# 8th argument
-
- mov 240($key),$rnds_
- mov $key,$key_
- shl \$4,$rnds_
- $movkey ($key),$rndkey0l # round[0]
- $movkey 16($key,$rnds_),$rndkey1 # round[last]
-
- movdqu ($offset_p),@offset[5] # load last offset_i
- pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
- pxor $rndkey1,@offset[5] # offset_i ^ round[last]
-
- mov \$16+32,$rounds
- lea 32($key_,$rnds_),$key
- $movkey 16($key_),$rndkey1 # round[1]
- sub %r10,%rax # twisted $rounds
- mov %rax,%r10 # backup twisted $rounds
-
- movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
- movdqu ($checksum_p),$checksum # load checksum
-
- test \$1,$block_num # is first block number odd?
- jnz .Locb_enc_odd
-
- bsf $block_num,$i1
- add \$1,$block_num
- shl \$4,$i1
- movdqu ($L_p,$i1),$inout5 # borrow
- movdqu ($inp),$inout0
- lea 16($inp),$inp
-
- call __ocb_encrypt1
-
- movdqa $inout5,@offset[5]
- movups $inout0,($out)
- lea 16($out),$out
- sub \$1,$blocks
- jz .Locb_enc_done
-
-.Locb_enc_odd:
- lea 1($block_num),$i1 # even-numbered blocks
- lea 3($block_num),$i3
- lea 5($block_num),$i5
- lea 6($block_num),$block_num
- bsf $i1,$i1 # ntz(block)
- bsf $i3,$i3
- bsf $i5,$i5
- shl \$4,$i1 # ntz(block) -> table offset
- shl \$4,$i3
- shl \$4,$i5
-
- sub \$6,$blocks
- jc .Locb_enc_short
- jmp .Locb_enc_grandloop
-
-.align 32
-.Locb_enc_grandloop:
- movdqu `16*0`($inp),$inout0 # load input
- movdqu `16*1`($inp),$inout1
- movdqu `16*2`($inp),$inout2
- movdqu `16*3`($inp),$inout3
- movdqu `16*4`($inp),$inout4
- movdqu `16*5`($inp),$inout5
- lea `16*6`($inp),$inp
-
- call __ocb_encrypt6
-
- movups $inout0,`16*0`($out) # store output
- movups $inout1,`16*1`($out)
- movups $inout2,`16*2`($out)
- movups $inout3,`16*3`($out)
- movups $inout4,`16*4`($out)
- movups $inout5,`16*5`($out)
- lea `16*6`($out),$out
- sub \$6,$blocks
- jnc .Locb_enc_grandloop
-
-.Locb_enc_short:
- add \$6,$blocks
- jz .Locb_enc_done
-
- movdqu `16*0`($inp),$inout0
- cmp \$2,$blocks
- jb .Locb_enc_one
- movdqu `16*1`($inp),$inout1
- je .Locb_enc_two
-
- movdqu `16*2`($inp),$inout2
- cmp \$4,$blocks
- jb .Locb_enc_three
- movdqu `16*3`($inp),$inout3
- je .Locb_enc_four
-
- movdqu `16*4`($inp),$inout4
- pxor $inout5,$inout5
-
- call __ocb_encrypt6
-
- movdqa @offset[4],@offset[5]
- movups $inout0,`16*0`($out)
- movups $inout1,`16*1`($out)
- movups $inout2,`16*2`($out)
- movups $inout3,`16*3`($out)
- movups $inout4,`16*4`($out)
-
- jmp .Locb_enc_done
-
-.align 16
-.Locb_enc_one:
- movdqa @offset[0],$inout5 # borrow
-
- call __ocb_encrypt1
-
- movdqa $inout5,@offset[5]
- movups $inout0,`16*0`($out)
- jmp .Locb_enc_done
-
-.align 16
-.Locb_enc_two:
- pxor $inout2,$inout2
- pxor $inout3,$inout3
-
- call __ocb_encrypt4
-
- movdqa @offset[1],@offset[5]
- movups $inout0,`16*0`($out)
- movups $inout1,`16*1`($out)
-
- jmp .Locb_enc_done
-
-.align 16
-.Locb_enc_three:
- pxor $inout3,$inout3
-
- call __ocb_encrypt4
-
- movdqa @offset[2],@offset[5]
- movups $inout0,`16*0`($out)
- movups $inout1,`16*1`($out)
- movups $inout2,`16*2`($out)
-
- jmp .Locb_enc_done
-
-.align 16
-.Locb_enc_four:
- call __ocb_encrypt4
-
- movdqa @offset[3],@offset[5]
- movups $inout0,`16*0`($out)
- movups $inout1,`16*1`($out)
- movups $inout2,`16*2`($out)
- movups $inout3,`16*3`($out)
-
-.Locb_enc_done:
- pxor $rndkey0,@offset[5] # "remove" round[last]
- movdqu $checksum,($checksum_p) # store checksum
- movdqu @offset[5],($offset_p) # store last offset_i
-
- xorps %xmm0,%xmm0 # clear register bank
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
-___
-$code.=<<___ if (!$win64);
- pxor %xmm6,%xmm6
- pxor %xmm7,%xmm7
- pxor %xmm8,%xmm8
- pxor %xmm9,%xmm9
- pxor %xmm10,%xmm10
- pxor %xmm11,%xmm11
- pxor %xmm12,%xmm12
- pxor %xmm13,%xmm13
- pxor %xmm14,%xmm14
- pxor %xmm15,%xmm15
- lea 0x28(%rsp),%rax
-.cfi_def_cfa %rax,8
-___
-$code.=<<___ if ($win64);
- movaps 0x00(%rsp),%xmm6
- movaps %xmm0,0x00(%rsp) # clear stack
- movaps 0x10(%rsp),%xmm7
- movaps %xmm0,0x10(%rsp)
- movaps 0x20(%rsp),%xmm8
- movaps %xmm0,0x20(%rsp)
- movaps 0x30(%rsp),%xmm9
- movaps %xmm0,0x30(%rsp)
- movaps 0x40(%rsp),%xmm10
- movaps %xmm0,0x40(%rsp)
- movaps 0x50(%rsp),%xmm11
- movaps %xmm0,0x50(%rsp)
- movaps 0x60(%rsp),%xmm12
- movaps %xmm0,0x60(%rsp)
- movaps 0x70(%rsp),%xmm13
- movaps %xmm0,0x70(%rsp)
- movaps 0x80(%rsp),%xmm14
- movaps %xmm0,0x80(%rsp)
- movaps 0x90(%rsp),%xmm15
- movaps %xmm0,0x90(%rsp)
- lea 0xa0+0x28(%rsp),%rax
-.Locb_enc_pop:
-___
-$code.=<<___;
- mov -40(%rax),%r14
-.cfi_restore %r14
- mov -32(%rax),%r13
-.cfi_restore %r13
- mov -24(%rax),%r12
-.cfi_restore %r12
- mov -16(%rax),%rbp
-.cfi_restore %rbp
- mov -8(%rax),%rbx
-.cfi_restore %rbx
- lea (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Locb_enc_epilogue:
- ret
-.cfi_endproc
-.size ${PREFIX}_ocb_encrypt,.-${PREFIX}_ocb_encrypt
-
-.type __ocb_encrypt6,\@abi-omnipotent
-.align 32
-__ocb_encrypt6:
- pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
- movdqu ($L_p,$i1),@offset[1]
- movdqa @offset[0],@offset[2]
- movdqu ($L_p,$i3),@offset[3]
- movdqa @offset[0],@offset[4]
- pxor @offset[5],@offset[0]
- movdqu ($L_p,$i5),@offset[5]
- pxor @offset[0],@offset[1]
- pxor $inout0,$checksum # accumulate checksum
- pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
- pxor @offset[1],@offset[2]
- pxor $inout1,$checksum
- pxor @offset[1],$inout1
- pxor @offset[2],@offset[3]
- pxor $inout2,$checksum
- pxor @offset[2],$inout2
- pxor @offset[3],@offset[4]
- pxor $inout3,$checksum
- pxor @offset[3],$inout3
- pxor @offset[4],@offset[5]
- pxor $inout4,$checksum
- pxor @offset[4],$inout4
- pxor $inout5,$checksum
- pxor @offset[5],$inout5
- $movkey 32($key_),$rndkey0
-
- lea 1($block_num),$i1 # even-numbered blocks
- lea 3($block_num),$i3
- lea 5($block_num),$i5
- add \$6,$block_num
- pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
- bsf $i1,$i1 # ntz(block)
- bsf $i3,$i3
- bsf $i5,$i5
-
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- aesenc $rndkey1,$inout2
- aesenc $rndkey1,$inout3
- pxor $rndkey0l,@offset[1]
- pxor $rndkey0l,@offset[2]
- aesenc $rndkey1,$inout4
- pxor $rndkey0l,@offset[3]
- pxor $rndkey0l,@offset[4]
- aesenc $rndkey1,$inout5
- $movkey 48($key_),$rndkey1
- pxor $rndkey0l,@offset[5]
-
- aesenc $rndkey0,$inout0
- aesenc $rndkey0,$inout1
- aesenc $rndkey0,$inout2
- aesenc $rndkey0,$inout3
- aesenc $rndkey0,$inout4
- aesenc $rndkey0,$inout5
- $movkey 64($key_),$rndkey0
- shl \$4,$i1 # ntz(block) -> table offset
- shl \$4,$i3
- jmp .Locb_enc_loop6
-
-.align 32
-.Locb_enc_loop6:
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- aesenc $rndkey1,$inout2
- aesenc $rndkey1,$inout3
- aesenc $rndkey1,$inout4
- aesenc $rndkey1,$inout5
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
-
- aesenc $rndkey0,$inout0
- aesenc $rndkey0,$inout1
- aesenc $rndkey0,$inout2
- aesenc $rndkey0,$inout3
- aesenc $rndkey0,$inout4
- aesenc $rndkey0,$inout5
- $movkey -16($key,%rax),$rndkey0
- jnz .Locb_enc_loop6
-
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- aesenc $rndkey1,$inout2
- aesenc $rndkey1,$inout3
- aesenc $rndkey1,$inout4
- aesenc $rndkey1,$inout5
- $movkey 16($key_),$rndkey1
- shl \$4,$i5
-
- aesenclast @offset[0],$inout0
- movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
- mov %r10,%rax # restore twisted rounds
- aesenclast @offset[1],$inout1
- aesenclast @offset[2],$inout2
- aesenclast @offset[3],$inout3
- aesenclast @offset[4],$inout4
- aesenclast @offset[5],$inout5
- ret
-.size __ocb_encrypt6,.-__ocb_encrypt6
-
-.type __ocb_encrypt4,\@abi-omnipotent
-.align 32
-__ocb_encrypt4:
- pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
- movdqu ($L_p,$i1),@offset[1]
- movdqa @offset[0],@offset[2]
- movdqu ($L_p,$i3),@offset[3]
- pxor @offset[5],@offset[0]
- pxor @offset[0],@offset[1]
- pxor $inout0,$checksum # accumulate checksum
- pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
- pxor @offset[1],@offset[2]
- pxor $inout1,$checksum
- pxor @offset[1],$inout1
- pxor @offset[2],@offset[3]
- pxor $inout2,$checksum
- pxor @offset[2],$inout2
- pxor $inout3,$checksum
- pxor @offset[3],$inout3
- $movkey 32($key_),$rndkey0
-
- pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
- pxor $rndkey0l,@offset[1]
- pxor $rndkey0l,@offset[2]
- pxor $rndkey0l,@offset[3]
-
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- aesenc $rndkey1,$inout2
- aesenc $rndkey1,$inout3
- $movkey 48($key_),$rndkey1
-
- aesenc $rndkey0,$inout0
- aesenc $rndkey0,$inout1
- aesenc $rndkey0,$inout2
- aesenc $rndkey0,$inout3
- $movkey 64($key_),$rndkey0
- jmp .Locb_enc_loop4
-
-.align 32
-.Locb_enc_loop4:
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- aesenc $rndkey1,$inout2
- aesenc $rndkey1,$inout3
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
-
- aesenc $rndkey0,$inout0
- aesenc $rndkey0,$inout1
- aesenc $rndkey0,$inout2
- aesenc $rndkey0,$inout3
- $movkey -16($key,%rax),$rndkey0
- jnz .Locb_enc_loop4
-
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- aesenc $rndkey1,$inout2
- aesenc $rndkey1,$inout3
- $movkey 16($key_),$rndkey1
- mov %r10,%rax # restore twisted rounds
-
- aesenclast @offset[0],$inout0
- aesenclast @offset[1],$inout1
- aesenclast @offset[2],$inout2
- aesenclast @offset[3],$inout3
- ret
-.size __ocb_encrypt4,.-__ocb_encrypt4
-
-.type __ocb_encrypt1,\@abi-omnipotent
-.align 32
-__ocb_encrypt1:
- pxor @offset[5],$inout5 # offset_i
- pxor $rndkey0l,$inout5 # offset_i ^ round[0]
- pxor $inout0,$checksum # accumulate checksum
- pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
- $movkey 32($key_),$rndkey0
-
- aesenc $rndkey1,$inout0
- $movkey 48($key_),$rndkey1
- pxor $rndkey0l,$inout5 # offset_i ^ round[last]
-
- aesenc $rndkey0,$inout0
- $movkey 64($key_),$rndkey0
- jmp .Locb_enc_loop1
-
-.align 32
-.Locb_enc_loop1:
- aesenc $rndkey1,$inout0
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
-
- aesenc $rndkey0,$inout0
- $movkey -16($key,%rax),$rndkey0
- jnz .Locb_enc_loop1
-
- aesenc $rndkey1,$inout0
- $movkey 16($key_),$rndkey1 # redundant in tail
- mov %r10,%rax # restore twisted rounds
-
- aesenclast $inout5,$inout0
- ret
-.size __ocb_encrypt1,.-__ocb_encrypt1
-
-.globl ${PREFIX}_ocb_decrypt
-.type ${PREFIX}_ocb_decrypt,\@function,6
-.align 32
-${PREFIX}_ocb_decrypt:
-.cfi_startproc
- lea (%rsp),%rax
- push %rbx
-.cfi_push %rbx
- push %rbp
-.cfi_push %rbp
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
-___
-$code.=<<___ if ($win64);
- lea -0xa0(%rsp),%rsp
- movaps %xmm6,0x00(%rsp) # offload everything
- movaps %xmm7,0x10(%rsp)
- movaps %xmm8,0x20(%rsp)
- movaps %xmm9,0x30(%rsp)
- movaps %xmm10,0x40(%rsp)
- movaps %xmm11,0x50(%rsp)
- movaps %xmm12,0x60(%rsp)
- movaps %xmm13,0x70(%rsp)
- movaps %xmm14,0x80(%rsp)
- movaps %xmm15,0x90(%rsp)
-.Locb_dec_body:
-___
-$code.=<<___;
- mov $seventh_arg(%rax),$L_p # 7th argument
- mov $seventh_arg+8(%rax),$checksum_p# 8th argument
-
- mov 240($key),$rnds_
- mov $key,$key_
- shl \$4,$rnds_
- $movkey ($key),$rndkey0l # round[0]
- $movkey 16($key,$rnds_),$rndkey1 # round[last]
-
- movdqu ($offset_p),@offset[5] # load last offset_i
- pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
- pxor $rndkey1,@offset[5] # offset_i ^ round[last]
-
- mov \$16+32,$rounds
- lea 32($key_,$rnds_),$key
- $movkey 16($key_),$rndkey1 # round[1]
- sub %r10,%rax # twisted $rounds
- mov %rax,%r10 # backup twisted $rounds
-
- movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
- movdqu ($checksum_p),$checksum # load checksum
-
- test \$1,$block_num # is first block number odd?
- jnz .Locb_dec_odd
-
- bsf $block_num,$i1
- add \$1,$block_num
- shl \$4,$i1
- movdqu ($L_p,$i1),$inout5 # borrow
- movdqu ($inp),$inout0
- lea 16($inp),$inp
-
- call __ocb_decrypt1
-
- movdqa $inout5,@offset[5]
- movups $inout0,($out)
- xorps $inout0,$checksum # accumulate checksum
- lea 16($out),$out
- sub \$1,$blocks
- jz .Locb_dec_done
-
-.Locb_dec_odd:
- lea 1($block_num),$i1 # even-numbered blocks
- lea 3($block_num),$i3
- lea 5($block_num),$i5
- lea 6($block_num),$block_num
- bsf $i1,$i1 # ntz(block)
- bsf $i3,$i3
- bsf $i5,$i5
- shl \$4,$i1 # ntz(block) -> table offset
- shl \$4,$i3
- shl \$4,$i5
-
- sub \$6,$blocks
- jc .Locb_dec_short
- jmp .Locb_dec_grandloop
-
-.align 32
-.Locb_dec_grandloop:
- movdqu `16*0`($inp),$inout0 # load input
- movdqu `16*1`($inp),$inout1
- movdqu `16*2`($inp),$inout2
- movdqu `16*3`($inp),$inout3
- movdqu `16*4`($inp),$inout4
- movdqu `16*5`($inp),$inout5
- lea `16*6`($inp),$inp
-
- call __ocb_decrypt6
-
- movups $inout0,`16*0`($out) # store output
- pxor $inout0,$checksum # accumulate checksum
- movups $inout1,`16*1`($out)
- pxor $inout1,$checksum
- movups $inout2,`16*2`($out)
- pxor $inout2,$checksum
- movups $inout3,`16*3`($out)
- pxor $inout3,$checksum
- movups $inout4,`16*4`($out)
- pxor $inout4,$checksum
- movups $inout5,`16*5`($out)
- pxor $inout5,$checksum
- lea `16*6`($out),$out
- sub \$6,$blocks
- jnc .Locb_dec_grandloop
-
-.Locb_dec_short:
- add \$6,$blocks
- jz .Locb_dec_done
-
- movdqu `16*0`($inp),$inout0
- cmp \$2,$blocks
- jb .Locb_dec_one
- movdqu `16*1`($inp),$inout1
- je .Locb_dec_two
-
- movdqu `16*2`($inp),$inout2
- cmp \$4,$blocks
- jb .Locb_dec_three
- movdqu `16*3`($inp),$inout3
- je .Locb_dec_four
-
- movdqu `16*4`($inp),$inout4
- pxor $inout5,$inout5
-
- call __ocb_decrypt6
-
- movdqa @offset[4],@offset[5]
- movups $inout0,`16*0`($out) # store output
- pxor $inout0,$checksum # accumulate checksum
- movups $inout1,`16*1`($out)
- pxor $inout1,$checksum
- movups $inout2,`16*2`($out)
- pxor $inout2,$checksum
- movups $inout3,`16*3`($out)
- pxor $inout3,$checksum
- movups $inout4,`16*4`($out)
- pxor $inout4,$checksum
-
- jmp .Locb_dec_done
-
-.align 16
-.Locb_dec_one:
- movdqa @offset[0],$inout5 # borrow
-
- call __ocb_decrypt1
-
- movdqa $inout5,@offset[5]
- movups $inout0,`16*0`($out) # store output
- xorps $inout0,$checksum # accumulate checksum
- jmp .Locb_dec_done
-
-.align 16
-.Locb_dec_two:
- pxor $inout2,$inout2
- pxor $inout3,$inout3
-
- call __ocb_decrypt4
-
- movdqa @offset[1],@offset[5]
- movups $inout0,`16*0`($out) # store output
- xorps $inout0,$checksum # accumulate checksum
- movups $inout1,`16*1`($out)
- xorps $inout1,$checksum
-
- jmp .Locb_dec_done
-
-.align 16
-.Locb_dec_three:
- pxor $inout3,$inout3
-
- call __ocb_decrypt4
-
- movdqa @offset[2],@offset[5]
- movups $inout0,`16*0`($out) # store output
- xorps $inout0,$checksum # accumulate checksum
- movups $inout1,`16*1`($out)
- xorps $inout1,$checksum
- movups $inout2,`16*2`($out)
- xorps $inout2,$checksum
-
- jmp .Locb_dec_done
-
-.align 16
-.Locb_dec_four:
- call __ocb_decrypt4
-
- movdqa @offset[3],@offset[5]
- movups $inout0,`16*0`($out) # store output
- pxor $inout0,$checksum # accumulate checksum
- movups $inout1,`16*1`($out)
- pxor $inout1,$checksum
- movups $inout2,`16*2`($out)
- pxor $inout2,$checksum
- movups $inout3,`16*3`($out)
- pxor $inout3,$checksum
-
-.Locb_dec_done:
- pxor $rndkey0,@offset[5] # "remove" round[last]
- movdqu $checksum,($checksum_p) # store checksum
- movdqu @offset[5],($offset_p) # store last offset_i
-
- xorps %xmm0,%xmm0 # clear register bank
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
-___
-$code.=<<___ if (!$win64);
- pxor %xmm6,%xmm6
- pxor %xmm7,%xmm7
- pxor %xmm8,%xmm8
- pxor %xmm9,%xmm9
- pxor %xmm10,%xmm10
- pxor %xmm11,%xmm11
- pxor %xmm12,%xmm12
- pxor %xmm13,%xmm13
- pxor %xmm14,%xmm14
- pxor %xmm15,%xmm15
- lea 0x28(%rsp),%rax
-.cfi_def_cfa %rax,8
-___
-$code.=<<___ if ($win64);
- movaps 0x00(%rsp),%xmm6
- movaps %xmm0,0x00(%rsp) # clear stack
- movaps 0x10(%rsp),%xmm7
- movaps %xmm0,0x10(%rsp)
- movaps 0x20(%rsp),%xmm8
- movaps %xmm0,0x20(%rsp)
- movaps 0x30(%rsp),%xmm9
- movaps %xmm0,0x30(%rsp)
- movaps 0x40(%rsp),%xmm10
- movaps %xmm0,0x40(%rsp)
- movaps 0x50(%rsp),%xmm11
- movaps %xmm0,0x50(%rsp)
- movaps 0x60(%rsp),%xmm12
- movaps %xmm0,0x60(%rsp)
- movaps 0x70(%rsp),%xmm13
- movaps %xmm0,0x70(%rsp)
- movaps 0x80(%rsp),%xmm14
- movaps %xmm0,0x80(%rsp)
- movaps 0x90(%rsp),%xmm15
- movaps %xmm0,0x90(%rsp)
- lea 0xa0+0x28(%rsp),%rax
-.Locb_dec_pop:
-___
-$code.=<<___;
- mov -40(%rax),%r14
-.cfi_restore %r14
- mov -32(%rax),%r13
-.cfi_restore %r13
- mov -24(%rax),%r12
-.cfi_restore %r12
- mov -16(%rax),%rbp
-.cfi_restore %rbp
- mov -8(%rax),%rbx
-.cfi_restore %rbx
- lea (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Locb_dec_epilogue:
- ret
-.cfi_endproc
-.size ${PREFIX}_ocb_decrypt,.-${PREFIX}_ocb_decrypt
-
-.type __ocb_decrypt6,\@abi-omnipotent
-.align 32
-__ocb_decrypt6:
- pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
- movdqu ($L_p,$i1),@offset[1]
- movdqa @offset[0],@offset[2]
- movdqu ($L_p,$i3),@offset[3]
- movdqa @offset[0],@offset[4]
- pxor @offset[5],@offset[0]
- movdqu ($L_p,$i5),@offset[5]
- pxor @offset[0],@offset[1]
- pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
- pxor @offset[1],@offset[2]
- pxor @offset[1],$inout1
- pxor @offset[2],@offset[3]
- pxor @offset[2],$inout2
- pxor @offset[3],@offset[4]
- pxor @offset[3],$inout3
- pxor @offset[4],@offset[5]
- pxor @offset[4],$inout4
- pxor @offset[5],$inout5
- $movkey 32($key_),$rndkey0
-
- lea 1($block_num),$i1 # even-numbered blocks
- lea 3($block_num),$i3
- lea 5($block_num),$i5
- add \$6,$block_num
- pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
- bsf $i1,$i1 # ntz(block)
- bsf $i3,$i3
- bsf $i5,$i5
-
- aesdec $rndkey1,$inout0
- aesdec $rndkey1,$inout1
- aesdec $rndkey1,$inout2
- aesdec $rndkey1,$inout3
- pxor $rndkey0l,@offset[1]
- pxor $rndkey0l,@offset[2]
- aesdec $rndkey1,$inout4
- pxor $rndkey0l,@offset[3]
- pxor $rndkey0l,@offset[4]
- aesdec $rndkey1,$inout5
- $movkey 48($key_),$rndkey1
- pxor $rndkey0l,@offset[5]
-
- aesdec $rndkey0,$inout0
- aesdec $rndkey0,$inout1
- aesdec $rndkey0,$inout2
- aesdec $rndkey0,$inout3
- aesdec $rndkey0,$inout4
- aesdec $rndkey0,$inout5
- $movkey 64($key_),$rndkey0
- shl \$4,$i1 # ntz(block) -> table offset
- shl \$4,$i3
- jmp .Locb_dec_loop6
-
-.align 32
-.Locb_dec_loop6:
- aesdec $rndkey1,$inout0
- aesdec $rndkey1,$inout1
- aesdec $rndkey1,$inout2
- aesdec $rndkey1,$inout3
- aesdec $rndkey1,$inout4
- aesdec $rndkey1,$inout5
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
-
- aesdec $rndkey0,$inout0
- aesdec $rndkey0,$inout1
- aesdec $rndkey0,$inout2
- aesdec $rndkey0,$inout3
- aesdec $rndkey0,$inout4
- aesdec $rndkey0,$inout5
- $movkey -16($key,%rax),$rndkey0
- jnz .Locb_dec_loop6
-
- aesdec $rndkey1,$inout0
- aesdec $rndkey1,$inout1
- aesdec $rndkey1,$inout2
- aesdec $rndkey1,$inout3
- aesdec $rndkey1,$inout4
- aesdec $rndkey1,$inout5
- $movkey 16($key_),$rndkey1
- shl \$4,$i5
-
- aesdeclast @offset[0],$inout0
- movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
- mov %r10,%rax # restore twisted rounds
- aesdeclast @offset[1],$inout1
- aesdeclast @offset[2],$inout2
- aesdeclast @offset[3],$inout3
- aesdeclast @offset[4],$inout4
- aesdeclast @offset[5],$inout5
- ret
-.size __ocb_decrypt6,.-__ocb_decrypt6
-
-.type __ocb_decrypt4,\@abi-omnipotent
-.align 32
-__ocb_decrypt4:
- pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
- movdqu ($L_p,$i1),@offset[1]
- movdqa @offset[0],@offset[2]
- movdqu ($L_p,$i3),@offset[3]
- pxor @offset[5],@offset[0]
- pxor @offset[0],@offset[1]
- pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
- pxor @offset[1],@offset[2]
- pxor @offset[1],$inout1
- pxor @offset[2],@offset[3]
- pxor @offset[2],$inout2
- pxor @offset[3],$inout3
- $movkey 32($key_),$rndkey0
-
- pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
- pxor $rndkey0l,@offset[1]
- pxor $rndkey0l,@offset[2]
- pxor $rndkey0l,@offset[3]
-
- aesdec $rndkey1,$inout0
- aesdec $rndkey1,$inout1
- aesdec $rndkey1,$inout2
- aesdec $rndkey1,$inout3
- $movkey 48($key_),$rndkey1
-
- aesdec $rndkey0,$inout0
- aesdec $rndkey0,$inout1
- aesdec $rndkey0,$inout2
- aesdec $rndkey0,$inout3
- $movkey 64($key_),$rndkey0
- jmp .Locb_dec_loop4
-
-.align 32
-.Locb_dec_loop4:
- aesdec $rndkey1,$inout0
- aesdec $rndkey1,$inout1
- aesdec $rndkey1,$inout2
- aesdec $rndkey1,$inout3
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
-
- aesdec $rndkey0,$inout0
- aesdec $rndkey0,$inout1
- aesdec $rndkey0,$inout2
- aesdec $rndkey0,$inout3
- $movkey -16($key,%rax),$rndkey0
- jnz .Locb_dec_loop4
-
- aesdec $rndkey1,$inout0
- aesdec $rndkey1,$inout1
- aesdec $rndkey1,$inout2
- aesdec $rndkey1,$inout3
- $movkey 16($key_),$rndkey1
- mov %r10,%rax # restore twisted rounds
-
- aesdeclast @offset[0],$inout0
- aesdeclast @offset[1],$inout1
- aesdeclast @offset[2],$inout2
- aesdeclast @offset[3],$inout3
- ret
-.size __ocb_decrypt4,.-__ocb_decrypt4
-
-.type __ocb_decrypt1,\@abi-omnipotent
-.align 32
-__ocb_decrypt1:
- pxor @offset[5],$inout5 # offset_i
- pxor $rndkey0l,$inout5 # offset_i ^ round[0]
- pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
- $movkey 32($key_),$rndkey0
-
- aesdec $rndkey1,$inout0
- $movkey 48($key_),$rndkey1
- pxor $rndkey0l,$inout5 # offset_i ^ round[last]
-
- aesdec $rndkey0,$inout0
- $movkey 64($key_),$rndkey0
- jmp .Locb_dec_loop1
-
-.align 32
-.Locb_dec_loop1:
- aesdec $rndkey1,$inout0
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
-
- aesdec $rndkey0,$inout0
- $movkey -16($key,%rax),$rndkey0
- jnz .Locb_dec_loop1
-
- aesdec $rndkey1,$inout0
- $movkey 16($key_),$rndkey1 # redundant in tail
- mov %r10,%rax # restore twisted rounds
-
- aesdeclast $inout5,$inout0
- ret
-.size __ocb_decrypt1,.-__ocb_decrypt1
-___
} }}
########################################################################