Remove if'd-out OCB-AES assembly

BoringSSL never shipped the OCB-AES assembly, but took two different
approaches to disabling it for x86 versus x86_64. For x86, the
implementation was deleted, but for x86_64 it was wrapped in `if (0)`.

Since we're no longer as concerned about keeping the assembly from
diverging from upstream, make the two architectures consistent by
deleting the OCB-AES functions from x86_64 as well.

Change-Id: I5233134e3e131fed56f365ed6f43f30c39dd2e33
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/56989
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 2abc8d0..320760a 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -2764,955 +2764,6 @@
 .cfi_endproc
 .size	${PREFIX}_xts_decrypt,.-${PREFIX}_xts_decrypt
 ___
-}
-
-######################################################################
-# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
-#	const AES_KEY *key, unsigned int start_block_num,
-#	unsigned char offset_i[16], const unsigned char L_[][16],
-#	unsigned char checksum[16]);
-#
-if (0) {  # Omit these functions in BoringSSL
-my @offset=map("%xmm$_",(10..15));
-my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
-my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
-my ($L_p,$checksum_p) = ("%rbx","%rbp");
-my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
-my $seventh_arg = $win64 ? 56 : 8;
-my $blocks = $len;
-
-$code.=<<___;
-.globl	${PREFIX}_ocb_encrypt
-.type	${PREFIX}_ocb_encrypt,\@function,6
-.align	32
-${PREFIX}_ocb_encrypt:
-.cfi_startproc
-	lea	(%rsp),%rax
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-___
-$code.=<<___ if ($win64);
-	lea	-0xa0(%rsp),%rsp
-	movaps	%xmm6,0x00(%rsp)		# offload everything
-	movaps	%xmm7,0x10(%rsp)
-	movaps	%xmm8,0x20(%rsp)
-	movaps	%xmm9,0x30(%rsp)
-	movaps	%xmm10,0x40(%rsp)
-	movaps	%xmm11,0x50(%rsp)
-	movaps	%xmm12,0x60(%rsp)
-	movaps	%xmm13,0x70(%rsp)
-	movaps	%xmm14,0x80(%rsp)
-	movaps	%xmm15,0x90(%rsp)
-.Locb_enc_body:
-___
-$code.=<<___;
-	mov	$seventh_arg(%rax),$L_p		# 7th argument
-	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument
-
-	mov	240($key),$rnds_
-	mov	$key,$key_
-	shl	\$4,$rnds_
-	$movkey	($key),$rndkey0l		# round[0]
-	$movkey	16($key,$rnds_),$rndkey1	# round[last]
-
-	movdqu	($offset_p),@offset[5]		# load last offset_i
-	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
-	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
-
-	mov	\$16+32,$rounds
-	lea	32($key_,$rnds_),$key
-	$movkey	16($key_),$rndkey1		# round[1]
-	sub	%r10,%rax			# twisted $rounds
-	mov	%rax,%r10			# backup twisted $rounds
-
-	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
-	movdqu	($checksum_p),$checksum		# load checksum
-
-	test	\$1,$block_num			# is first block number odd?
-	jnz	.Locb_enc_odd
-
-	bsf	$block_num,$i1
-	add	\$1,$block_num
-	shl	\$4,$i1
-	movdqu	($L_p,$i1),$inout5		# borrow
-	movdqu	($inp),$inout0
-	lea	16($inp),$inp
-
-	call	__ocb_encrypt1
-
-	movdqa	$inout5,@offset[5]
-	movups	$inout0,($out)
-	lea	16($out),$out
-	sub	\$1,$blocks
-	jz	.Locb_enc_done
-
-.Locb_enc_odd:
-	lea	1($block_num),$i1		# even-numbered blocks
-	lea	3($block_num),$i3
-	lea	5($block_num),$i5
-	lea	6($block_num),$block_num
-	bsf	$i1,$i1				# ntz(block)
-	bsf	$i3,$i3
-	bsf	$i5,$i5
-	shl	\$4,$i1				# ntz(block) -> table offset
-	shl	\$4,$i3
-	shl	\$4,$i5
-
-	sub	\$6,$blocks
-	jc	.Locb_enc_short
-	jmp	.Locb_enc_grandloop
-
-.align	32
-.Locb_enc_grandloop:
-	movdqu	`16*0`($inp),$inout0		# load input
-	movdqu	`16*1`($inp),$inout1
-	movdqu	`16*2`($inp),$inout2
-	movdqu	`16*3`($inp),$inout3
-	movdqu	`16*4`($inp),$inout4
-	movdqu	`16*5`($inp),$inout5
-	lea	`16*6`($inp),$inp
-
-	call	__ocb_encrypt6
-
-	movups	$inout0,`16*0`($out)		# store output
-	movups	$inout1,`16*1`($out)
-	movups	$inout2,`16*2`($out)
-	movups	$inout3,`16*3`($out)
-	movups	$inout4,`16*4`($out)
-	movups	$inout5,`16*5`($out)
-	lea	`16*6`($out),$out
-	sub	\$6,$blocks
-	jnc	.Locb_enc_grandloop
-
-.Locb_enc_short:
-	add	\$6,$blocks
-	jz	.Locb_enc_done
-
-	movdqu	`16*0`($inp),$inout0
-	cmp	\$2,$blocks
-	jb	.Locb_enc_one
-	movdqu	`16*1`($inp),$inout1
-	je	.Locb_enc_two
-
-	movdqu	`16*2`($inp),$inout2
-	cmp	\$4,$blocks
-	jb	.Locb_enc_three
-	movdqu	`16*3`($inp),$inout3
-	je	.Locb_enc_four
-
-	movdqu	`16*4`($inp),$inout4
-	pxor	$inout5,$inout5
-
-	call	__ocb_encrypt6
-
-	movdqa	@offset[4],@offset[5]
-	movups	$inout0,`16*0`($out)
-	movups	$inout1,`16*1`($out)
-	movups	$inout2,`16*2`($out)
-	movups	$inout3,`16*3`($out)
-	movups	$inout4,`16*4`($out)
-
-	jmp	.Locb_enc_done
-
-.align	16
-.Locb_enc_one:
-	movdqa	@offset[0],$inout5		# borrow
-
-	call	__ocb_encrypt1
-
-	movdqa	$inout5,@offset[5]
-	movups	$inout0,`16*0`($out)
-	jmp	.Locb_enc_done
-
-.align	16
-.Locb_enc_two:
-	pxor	$inout2,$inout2
-	pxor	$inout3,$inout3
-
-	call	__ocb_encrypt4
-
-	movdqa	@offset[1],@offset[5]
-	movups	$inout0,`16*0`($out)
-	movups	$inout1,`16*1`($out)
-
-	jmp	.Locb_enc_done
-
-.align	16
-.Locb_enc_three:
-	pxor	$inout3,$inout3
-
-	call	__ocb_encrypt4
-
-	movdqa	@offset[2],@offset[5]
-	movups	$inout0,`16*0`($out)
-	movups	$inout1,`16*1`($out)
-	movups	$inout2,`16*2`($out)
-
-	jmp	.Locb_enc_done
-
-.align	16
-.Locb_enc_four:
-	call	__ocb_encrypt4
-
-	movdqa	@offset[3],@offset[5]
-	movups	$inout0,`16*0`($out)
-	movups	$inout1,`16*1`($out)
-	movups	$inout2,`16*2`($out)
-	movups	$inout3,`16*3`($out)
-
-.Locb_enc_done:
-	pxor	$rndkey0,@offset[5]		# "remove" round[last]
-	movdqu	$checksum,($checksum_p)		# store checksum
-	movdqu	@offset[5],($offset_p)		# store last offset_i
-
-	xorps	%xmm0,%xmm0			# clear register bank
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-___
-$code.=<<___ if (!$win64);
-	pxor	%xmm6,%xmm6
-	pxor	%xmm7,%xmm7
-	pxor	%xmm8,%xmm8
-	pxor	%xmm9,%xmm9
-	pxor	%xmm10,%xmm10
-	pxor	%xmm11,%xmm11
-	pxor	%xmm12,%xmm12
-	pxor	%xmm13,%xmm13
-	pxor	%xmm14,%xmm14
-	pxor	%xmm15,%xmm15
-	lea	0x28(%rsp),%rax
-.cfi_def_cfa	%rax,8
-___
-$code.=<<___ if ($win64);
-	movaps	0x00(%rsp),%xmm6
-	movaps	%xmm0,0x00(%rsp)		# clear stack
-	movaps	0x10(%rsp),%xmm7
-	movaps	%xmm0,0x10(%rsp)
-	movaps	0x20(%rsp),%xmm8
-	movaps	%xmm0,0x20(%rsp)
-	movaps	0x30(%rsp),%xmm9
-	movaps	%xmm0,0x30(%rsp)
-	movaps	0x40(%rsp),%xmm10
-	movaps	%xmm0,0x40(%rsp)
-	movaps	0x50(%rsp),%xmm11
-	movaps	%xmm0,0x50(%rsp)
-	movaps	0x60(%rsp),%xmm12
-	movaps	%xmm0,0x60(%rsp)
-	movaps	0x70(%rsp),%xmm13
-	movaps	%xmm0,0x70(%rsp)
-	movaps	0x80(%rsp),%xmm14
-	movaps	%xmm0,0x80(%rsp)
-	movaps	0x90(%rsp),%xmm15
-	movaps	%xmm0,0x90(%rsp)
-	lea	0xa0+0x28(%rsp),%rax
-.Locb_enc_pop:
-___
-$code.=<<___;
-	mov	-40(%rax),%r14
-.cfi_restore	%r14
-	mov	-32(%rax),%r13
-.cfi_restore	%r13
-	mov	-24(%rax),%r12
-.cfi_restore	%r12
-	mov	-16(%rax),%rbp
-.cfi_restore	%rbp
-	mov	-8(%rax),%rbx
-.cfi_restore	%rbx
-	lea	(%rax),%rsp
-.cfi_def_cfa_register	%rsp
-.Locb_enc_epilogue:
-	ret
-.cfi_endproc
-.size	${PREFIX}_ocb_encrypt,.-${PREFIX}_ocb_encrypt
-
-.type	__ocb_encrypt6,\@abi-omnipotent
-.align	32
-__ocb_encrypt6:
-	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
-	 movdqu		($L_p,$i1),@offset[1]
-	 movdqa		@offset[0],@offset[2]
-	 movdqu		($L_p,$i3),@offset[3]
-	 movdqa		@offset[0],@offset[4]
-	 pxor		@offset[5],@offset[0]
-	 movdqu		($L_p,$i5),@offset[5]
-	 pxor		@offset[0],@offset[1]
-	pxor		$inout0,$checksum	# accumulate checksum
-	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
-	 pxor		@offset[1],@offset[2]
-	pxor		$inout1,$checksum
-	pxor		@offset[1],$inout1
-	 pxor		@offset[2],@offset[3]
-	pxor		$inout2,$checksum
-	pxor		@offset[2],$inout2
-	 pxor		@offset[3],@offset[4]
-	pxor		$inout3,$checksum
-	pxor		@offset[3],$inout3
-	 pxor		@offset[4],@offset[5]
-	pxor		$inout4,$checksum
-	pxor		@offset[4],$inout4
-	pxor		$inout5,$checksum
-	pxor		@offset[5],$inout5
-	$movkey		32($key_),$rndkey0
-
-	lea		1($block_num),$i1	# even-numbered blocks
-	lea		3($block_num),$i3
-	lea		5($block_num),$i5
-	add		\$6,$block_num
-	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
-	bsf		$i1,$i1			# ntz(block)
-	bsf		$i3,$i3
-	bsf		$i5,$i5
-
-	aesenc		$rndkey1,$inout0
-	aesenc		$rndkey1,$inout1
-	aesenc		$rndkey1,$inout2
-	aesenc		$rndkey1,$inout3
-	 pxor		$rndkey0l,@offset[1]
-	 pxor		$rndkey0l,@offset[2]
-	aesenc		$rndkey1,$inout4
-	 pxor		$rndkey0l,@offset[3]
-	 pxor		$rndkey0l,@offset[4]
-	aesenc		$rndkey1,$inout5
-	$movkey		48($key_),$rndkey1
-	 pxor		$rndkey0l,@offset[5]
-
-	aesenc		$rndkey0,$inout0
-	aesenc		$rndkey0,$inout1
-	aesenc		$rndkey0,$inout2
-	aesenc		$rndkey0,$inout3
-	aesenc		$rndkey0,$inout4
-	aesenc		$rndkey0,$inout5
-	$movkey		64($key_),$rndkey0
-	shl		\$4,$i1			# ntz(block) -> table offset
-	shl		\$4,$i3
-	jmp		.Locb_enc_loop6
-
-.align	32
-.Locb_enc_loop6:
-	aesenc		$rndkey1,$inout0
-	aesenc		$rndkey1,$inout1
-	aesenc		$rndkey1,$inout2
-	aesenc		$rndkey1,$inout3
-	aesenc		$rndkey1,$inout4
-	aesenc		$rndkey1,$inout5
-	$movkey		($key,%rax),$rndkey1
-	add		\$32,%rax
-
-	aesenc		$rndkey0,$inout0
-	aesenc		$rndkey0,$inout1
-	aesenc		$rndkey0,$inout2
-	aesenc		$rndkey0,$inout3
-	aesenc		$rndkey0,$inout4
-	aesenc		$rndkey0,$inout5
-	$movkey		-16($key,%rax),$rndkey0
-	jnz		.Locb_enc_loop6
-
-	aesenc		$rndkey1,$inout0
-	aesenc		$rndkey1,$inout1
-	aesenc		$rndkey1,$inout2
-	aesenc		$rndkey1,$inout3
-	aesenc		$rndkey1,$inout4
-	aesenc		$rndkey1,$inout5
-	$movkey		16($key_),$rndkey1
-	shl		\$4,$i5
-
-	aesenclast	@offset[0],$inout0
-	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
-	mov		%r10,%rax		# restore twisted rounds
-	aesenclast	@offset[1],$inout1
-	aesenclast	@offset[2],$inout2
-	aesenclast	@offset[3],$inout3
-	aesenclast	@offset[4],$inout4
-	aesenclast	@offset[5],$inout5
-	ret
-.size	__ocb_encrypt6,.-__ocb_encrypt6
-
-.type	__ocb_encrypt4,\@abi-omnipotent
-.align	32
-__ocb_encrypt4:
-	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
-	 movdqu		($L_p,$i1),@offset[1]
-	 movdqa		@offset[0],@offset[2]
-	 movdqu		($L_p,$i3),@offset[3]
-	 pxor		@offset[5],@offset[0]
-	 pxor		@offset[0],@offset[1]
-	pxor		$inout0,$checksum	# accumulate checksum
-	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
-	 pxor		@offset[1],@offset[2]
-	pxor		$inout1,$checksum
-	pxor		@offset[1],$inout1
-	 pxor		@offset[2],@offset[3]
-	pxor		$inout2,$checksum
-	pxor		@offset[2],$inout2
-	pxor		$inout3,$checksum
-	pxor		@offset[3],$inout3
-	$movkey		32($key_),$rndkey0
-
-	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
-	 pxor		$rndkey0l,@offset[1]
-	 pxor		$rndkey0l,@offset[2]
-	 pxor		$rndkey0l,@offset[3]
-
-	aesenc		$rndkey1,$inout0
-	aesenc		$rndkey1,$inout1
-	aesenc		$rndkey1,$inout2
-	aesenc		$rndkey1,$inout3
-	$movkey		48($key_),$rndkey1
-
-	aesenc		$rndkey0,$inout0
-	aesenc		$rndkey0,$inout1
-	aesenc		$rndkey0,$inout2
-	aesenc		$rndkey0,$inout3
-	$movkey		64($key_),$rndkey0
-	jmp		.Locb_enc_loop4
-
-.align	32
-.Locb_enc_loop4:
-	aesenc		$rndkey1,$inout0
-	aesenc		$rndkey1,$inout1
-	aesenc		$rndkey1,$inout2
-	aesenc		$rndkey1,$inout3
-	$movkey		($key,%rax),$rndkey1
-	add		\$32,%rax
-
-	aesenc		$rndkey0,$inout0
-	aesenc		$rndkey0,$inout1
-	aesenc		$rndkey0,$inout2
-	aesenc		$rndkey0,$inout3
-	$movkey		-16($key,%rax),$rndkey0
-	jnz		.Locb_enc_loop4
-
-	aesenc		$rndkey1,$inout0
-	aesenc		$rndkey1,$inout1
-	aesenc		$rndkey1,$inout2
-	aesenc		$rndkey1,$inout3
-	$movkey		16($key_),$rndkey1
-	mov		%r10,%rax		# restore twisted rounds
-
-	aesenclast	@offset[0],$inout0
-	aesenclast	@offset[1],$inout1
-	aesenclast	@offset[2],$inout2
-	aesenclast	@offset[3],$inout3
-	ret
-.size	__ocb_encrypt4,.-__ocb_encrypt4
-
-.type	__ocb_encrypt1,\@abi-omnipotent
-.align	32
-__ocb_encrypt1:
-	 pxor		@offset[5],$inout5	# offset_i
-	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
-	pxor		$inout0,$checksum	# accumulate checksum
-	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
-	$movkey		32($key_),$rndkey0
-
-	aesenc		$rndkey1,$inout0
-	$movkey		48($key_),$rndkey1
-	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
-
-	aesenc		$rndkey0,$inout0
-	$movkey		64($key_),$rndkey0
-	jmp		.Locb_enc_loop1
-
-.align	32
-.Locb_enc_loop1:
-	aesenc		$rndkey1,$inout0
-	$movkey		($key,%rax),$rndkey1
-	add		\$32,%rax
-
-	aesenc		$rndkey0,$inout0
-	$movkey		-16($key,%rax),$rndkey0
-	jnz		.Locb_enc_loop1
-
-	aesenc		$rndkey1,$inout0
-	$movkey		16($key_),$rndkey1	# redundant in tail
-	mov		%r10,%rax		# restore twisted rounds
-
-	aesenclast	$inout5,$inout0
-	ret
-.size	__ocb_encrypt1,.-__ocb_encrypt1
-
-.globl	${PREFIX}_ocb_decrypt
-.type	${PREFIX}_ocb_decrypt,\@function,6
-.align	32
-${PREFIX}_ocb_decrypt:
-.cfi_startproc
-	lea	(%rsp),%rax
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-___
-$code.=<<___ if ($win64);
-	lea	-0xa0(%rsp),%rsp
-	movaps	%xmm6,0x00(%rsp)		# offload everything
-	movaps	%xmm7,0x10(%rsp)
-	movaps	%xmm8,0x20(%rsp)
-	movaps	%xmm9,0x30(%rsp)
-	movaps	%xmm10,0x40(%rsp)
-	movaps	%xmm11,0x50(%rsp)
-	movaps	%xmm12,0x60(%rsp)
-	movaps	%xmm13,0x70(%rsp)
-	movaps	%xmm14,0x80(%rsp)
-	movaps	%xmm15,0x90(%rsp)
-.Locb_dec_body:
-___
-$code.=<<___;
-	mov	$seventh_arg(%rax),$L_p		# 7th argument
-	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument
-
-	mov	240($key),$rnds_
-	mov	$key,$key_
-	shl	\$4,$rnds_
-	$movkey	($key),$rndkey0l		# round[0]
-	$movkey	16($key,$rnds_),$rndkey1	# round[last]
-
-	movdqu	($offset_p),@offset[5]		# load last offset_i
-	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
-	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
-
-	mov	\$16+32,$rounds
-	lea	32($key_,$rnds_),$key
-	$movkey	16($key_),$rndkey1		# round[1]
-	sub	%r10,%rax			# twisted $rounds
-	mov	%rax,%r10			# backup twisted $rounds
-
-	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
-	movdqu	($checksum_p),$checksum		# load checksum
-
-	test	\$1,$block_num			# is first block number odd?
-	jnz	.Locb_dec_odd
-
-	bsf	$block_num,$i1
-	add	\$1,$block_num
-	shl	\$4,$i1
-	movdqu	($L_p,$i1),$inout5		# borrow
-	movdqu	($inp),$inout0
-	lea	16($inp),$inp
-
-	call	__ocb_decrypt1
-
-	movdqa	$inout5,@offset[5]
-	movups	$inout0,($out)
-	xorps	$inout0,$checksum		# accumulate checksum
-	lea	16($out),$out
-	sub	\$1,$blocks
-	jz	.Locb_dec_done
-
-.Locb_dec_odd:
-	lea	1($block_num),$i1		# even-numbered blocks
-	lea	3($block_num),$i3
-	lea	5($block_num),$i5
-	lea	6($block_num),$block_num
-	bsf	$i1,$i1				# ntz(block)
-	bsf	$i3,$i3
-	bsf	$i5,$i5
-	shl	\$4,$i1				# ntz(block) -> table offset
-	shl	\$4,$i3
-	shl	\$4,$i5
-
-	sub	\$6,$blocks
-	jc	.Locb_dec_short
-	jmp	.Locb_dec_grandloop
-
-.align	32
-.Locb_dec_grandloop:
-	movdqu	`16*0`($inp),$inout0		# load input
-	movdqu	`16*1`($inp),$inout1
-	movdqu	`16*2`($inp),$inout2
-	movdqu	`16*3`($inp),$inout3
-	movdqu	`16*4`($inp),$inout4
-	movdqu	`16*5`($inp),$inout5
-	lea	`16*6`($inp),$inp
-
-	call	__ocb_decrypt6
-
-	movups	$inout0,`16*0`($out)		# store output
-	pxor	$inout0,$checksum		# accumulate checksum
-	movups	$inout1,`16*1`($out)
-	pxor	$inout1,$checksum
-	movups	$inout2,`16*2`($out)
-	pxor	$inout2,$checksum
-	movups	$inout3,`16*3`($out)
-	pxor	$inout3,$checksum
-	movups	$inout4,`16*4`($out)
-	pxor	$inout4,$checksum
-	movups	$inout5,`16*5`($out)
-	pxor	$inout5,$checksum
-	lea	`16*6`($out),$out
-	sub	\$6,$blocks
-	jnc	.Locb_dec_grandloop
-
-.Locb_dec_short:
-	add	\$6,$blocks
-	jz	.Locb_dec_done
-
-	movdqu	`16*0`($inp),$inout0
-	cmp	\$2,$blocks
-	jb	.Locb_dec_one
-	movdqu	`16*1`($inp),$inout1
-	je	.Locb_dec_two
-
-	movdqu	`16*2`($inp),$inout2
-	cmp	\$4,$blocks
-	jb	.Locb_dec_three
-	movdqu	`16*3`($inp),$inout3
-	je	.Locb_dec_four
-
-	movdqu	`16*4`($inp),$inout4
-	pxor	$inout5,$inout5
-
-	call	__ocb_decrypt6
-
-	movdqa	@offset[4],@offset[5]
-	movups	$inout0,`16*0`($out)		# store output
-	pxor	$inout0,$checksum		# accumulate checksum
-	movups	$inout1,`16*1`($out)
-	pxor	$inout1,$checksum
-	movups	$inout2,`16*2`($out)
-	pxor	$inout2,$checksum
-	movups	$inout3,`16*3`($out)
-	pxor	$inout3,$checksum
-	movups	$inout4,`16*4`($out)
-	pxor	$inout4,$checksum
-
-	jmp	.Locb_dec_done
-
-.align	16
-.Locb_dec_one:
-	movdqa	@offset[0],$inout5		# borrow
-
-	call	__ocb_decrypt1
-
-	movdqa	$inout5,@offset[5]
-	movups	$inout0,`16*0`($out)		# store output
-	xorps	$inout0,$checksum		# accumulate checksum
-	jmp	.Locb_dec_done
-
-.align	16
-.Locb_dec_two:
-	pxor	$inout2,$inout2
-	pxor	$inout3,$inout3
-
-	call	__ocb_decrypt4
-
-	movdqa	@offset[1],@offset[5]
-	movups	$inout0,`16*0`($out)		# store output
-	xorps	$inout0,$checksum		# accumulate checksum
-	movups	$inout1,`16*1`($out)
-	xorps	$inout1,$checksum
-
-	jmp	.Locb_dec_done
-
-.align	16
-.Locb_dec_three:
-	pxor	$inout3,$inout3
-
-	call	__ocb_decrypt4
-
-	movdqa	@offset[2],@offset[5]
-	movups	$inout0,`16*0`($out)		# store output
-	xorps	$inout0,$checksum		# accumulate checksum
-	movups	$inout1,`16*1`($out)
-	xorps	$inout1,$checksum
-	movups	$inout2,`16*2`($out)
-	xorps	$inout2,$checksum
-
-	jmp	.Locb_dec_done
-
-.align	16
-.Locb_dec_four:
-	call	__ocb_decrypt4
-
-	movdqa	@offset[3],@offset[5]
-	movups	$inout0,`16*0`($out)		# store output
-	pxor	$inout0,$checksum		# accumulate checksum
-	movups	$inout1,`16*1`($out)
-	pxor	$inout1,$checksum
-	movups	$inout2,`16*2`($out)
-	pxor	$inout2,$checksum
-	movups	$inout3,`16*3`($out)
-	pxor	$inout3,$checksum
-
-.Locb_dec_done:
-	pxor	$rndkey0,@offset[5]		# "remove" round[last]
-	movdqu	$checksum,($checksum_p)		# store checksum
-	movdqu	@offset[5],($offset_p)		# store last offset_i
-
-	xorps	%xmm0,%xmm0			# clear register bank
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-___
-$code.=<<___ if (!$win64);
-	pxor	%xmm6,%xmm6
-	pxor	%xmm7,%xmm7
-	pxor	%xmm8,%xmm8
-	pxor	%xmm9,%xmm9
-	pxor	%xmm10,%xmm10
-	pxor	%xmm11,%xmm11
-	pxor	%xmm12,%xmm12
-	pxor	%xmm13,%xmm13
-	pxor	%xmm14,%xmm14
-	pxor	%xmm15,%xmm15
-	lea	0x28(%rsp),%rax
-.cfi_def_cfa	%rax,8
-___
-$code.=<<___ if ($win64);
-	movaps	0x00(%rsp),%xmm6
-	movaps	%xmm0,0x00(%rsp)		# clear stack
-	movaps	0x10(%rsp),%xmm7
-	movaps	%xmm0,0x10(%rsp)
-	movaps	0x20(%rsp),%xmm8
-	movaps	%xmm0,0x20(%rsp)
-	movaps	0x30(%rsp),%xmm9
-	movaps	%xmm0,0x30(%rsp)
-	movaps	0x40(%rsp),%xmm10
-	movaps	%xmm0,0x40(%rsp)
-	movaps	0x50(%rsp),%xmm11
-	movaps	%xmm0,0x50(%rsp)
-	movaps	0x60(%rsp),%xmm12
-	movaps	%xmm0,0x60(%rsp)
-	movaps	0x70(%rsp),%xmm13
-	movaps	%xmm0,0x70(%rsp)
-	movaps	0x80(%rsp),%xmm14
-	movaps	%xmm0,0x80(%rsp)
-	movaps	0x90(%rsp),%xmm15
-	movaps	%xmm0,0x90(%rsp)
-	lea	0xa0+0x28(%rsp),%rax
-.Locb_dec_pop:
-___
-$code.=<<___;
-	mov	-40(%rax),%r14
-.cfi_restore	%r14
-	mov	-32(%rax),%r13
-.cfi_restore	%r13
-	mov	-24(%rax),%r12
-.cfi_restore	%r12
-	mov	-16(%rax),%rbp
-.cfi_restore	%rbp
-	mov	-8(%rax),%rbx
-.cfi_restore	%rbx
-	lea	(%rax),%rsp
-.cfi_def_cfa_register	%rsp
-.Locb_dec_epilogue:
-	ret
-.cfi_endproc
-.size	${PREFIX}_ocb_decrypt,.-${PREFIX}_ocb_decrypt
-
-.type	__ocb_decrypt6,\@abi-omnipotent
-.align	32
-__ocb_decrypt6:
-	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
-	 movdqu		($L_p,$i1),@offset[1]
-	 movdqa		@offset[0],@offset[2]
-	 movdqu		($L_p,$i3),@offset[3]
-	 movdqa		@offset[0],@offset[4]
-	 pxor		@offset[5],@offset[0]
-	 movdqu		($L_p,$i5),@offset[5]
-	 pxor		@offset[0],@offset[1]
-	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
-	 pxor		@offset[1],@offset[2]
-	pxor		@offset[1],$inout1
-	 pxor		@offset[2],@offset[3]
-	pxor		@offset[2],$inout2
-	 pxor		@offset[3],@offset[4]
-	pxor		@offset[3],$inout3
-	 pxor		@offset[4],@offset[5]
-	pxor		@offset[4],$inout4
-	pxor		@offset[5],$inout5
-	$movkey		32($key_),$rndkey0
-
-	lea		1($block_num),$i1	# even-numbered blocks
-	lea		3($block_num),$i3
-	lea		5($block_num),$i5
-	add		\$6,$block_num
-	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
-	bsf		$i1,$i1			# ntz(block)
-	bsf		$i3,$i3
-	bsf		$i5,$i5
-
-	aesdec		$rndkey1,$inout0
-	aesdec		$rndkey1,$inout1
-	aesdec		$rndkey1,$inout2
-	aesdec		$rndkey1,$inout3
-	 pxor		$rndkey0l,@offset[1]
-	 pxor		$rndkey0l,@offset[2]
-	aesdec		$rndkey1,$inout4
-	 pxor		$rndkey0l,@offset[3]
-	 pxor		$rndkey0l,@offset[4]
-	aesdec		$rndkey1,$inout5
-	$movkey		48($key_),$rndkey1
-	 pxor		$rndkey0l,@offset[5]
-
-	aesdec		$rndkey0,$inout0
-	aesdec		$rndkey0,$inout1
-	aesdec		$rndkey0,$inout2
-	aesdec		$rndkey0,$inout3
-	aesdec		$rndkey0,$inout4
-	aesdec		$rndkey0,$inout5
-	$movkey		64($key_),$rndkey0
-	shl		\$4,$i1			# ntz(block) -> table offset
-	shl		\$4,$i3
-	jmp		.Locb_dec_loop6
-
-.align	32
-.Locb_dec_loop6:
-	aesdec		$rndkey1,$inout0
-	aesdec		$rndkey1,$inout1
-	aesdec		$rndkey1,$inout2
-	aesdec		$rndkey1,$inout3
-	aesdec		$rndkey1,$inout4
-	aesdec		$rndkey1,$inout5
-	$movkey		($key,%rax),$rndkey1
-	add		\$32,%rax
-
-	aesdec		$rndkey0,$inout0
-	aesdec		$rndkey0,$inout1
-	aesdec		$rndkey0,$inout2
-	aesdec		$rndkey0,$inout3
-	aesdec		$rndkey0,$inout4
-	aesdec		$rndkey0,$inout5
-	$movkey		-16($key,%rax),$rndkey0
-	jnz		.Locb_dec_loop6
-
-	aesdec		$rndkey1,$inout0
-	aesdec		$rndkey1,$inout1
-	aesdec		$rndkey1,$inout2
-	aesdec		$rndkey1,$inout3
-	aesdec		$rndkey1,$inout4
-	aesdec		$rndkey1,$inout5
-	$movkey		16($key_),$rndkey1
-	shl		\$4,$i5
-
-	aesdeclast	@offset[0],$inout0
-	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
-	mov		%r10,%rax		# restore twisted rounds
-	aesdeclast	@offset[1],$inout1
-	aesdeclast	@offset[2],$inout2
-	aesdeclast	@offset[3],$inout3
-	aesdeclast	@offset[4],$inout4
-	aesdeclast	@offset[5],$inout5
-	ret
-.size	__ocb_decrypt6,.-__ocb_decrypt6
-
-.type	__ocb_decrypt4,\@abi-omnipotent
-.align	32
-__ocb_decrypt4:
-	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
-	 movdqu		($L_p,$i1),@offset[1]
-	 movdqa		@offset[0],@offset[2]
-	 movdqu		($L_p,$i3),@offset[3]
-	 pxor		@offset[5],@offset[0]
-	 pxor		@offset[0],@offset[1]
-	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
-	 pxor		@offset[1],@offset[2]
-	pxor		@offset[1],$inout1
-	 pxor		@offset[2],@offset[3]
-	pxor		@offset[2],$inout2
-	pxor		@offset[3],$inout3
-	$movkey		32($key_),$rndkey0
-
-	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
-	 pxor		$rndkey0l,@offset[1]
-	 pxor		$rndkey0l,@offset[2]
-	 pxor		$rndkey0l,@offset[3]
-
-	aesdec		$rndkey1,$inout0
-	aesdec		$rndkey1,$inout1
-	aesdec		$rndkey1,$inout2
-	aesdec		$rndkey1,$inout3
-	$movkey		48($key_),$rndkey1
-
-	aesdec		$rndkey0,$inout0
-	aesdec		$rndkey0,$inout1
-	aesdec		$rndkey0,$inout2
-	aesdec		$rndkey0,$inout3
-	$movkey		64($key_),$rndkey0
-	jmp		.Locb_dec_loop4
-
-.align	32
-.Locb_dec_loop4:
-	aesdec		$rndkey1,$inout0
-	aesdec		$rndkey1,$inout1
-	aesdec		$rndkey1,$inout2
-	aesdec		$rndkey1,$inout3
-	$movkey		($key,%rax),$rndkey1
-	add		\$32,%rax
-
-	aesdec		$rndkey0,$inout0
-	aesdec		$rndkey0,$inout1
-	aesdec		$rndkey0,$inout2
-	aesdec		$rndkey0,$inout3
-	$movkey		-16($key,%rax),$rndkey0
-	jnz		.Locb_dec_loop4
-
-	aesdec		$rndkey1,$inout0
-	aesdec		$rndkey1,$inout1
-	aesdec		$rndkey1,$inout2
-	aesdec		$rndkey1,$inout3
-	$movkey		16($key_),$rndkey1
-	mov		%r10,%rax		# restore twisted rounds
-
-	aesdeclast	@offset[0],$inout0
-	aesdeclast	@offset[1],$inout1
-	aesdeclast	@offset[2],$inout2
-	aesdeclast	@offset[3],$inout3
-	ret
-.size	__ocb_decrypt4,.-__ocb_decrypt4
-
-.type	__ocb_decrypt1,\@abi-omnipotent
-.align	32
-__ocb_decrypt1:
-	 pxor		@offset[5],$inout5	# offset_i
-	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
-	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
-	$movkey		32($key_),$rndkey0
-
-	aesdec		$rndkey1,$inout0
-	$movkey		48($key_),$rndkey1
-	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
-
-	aesdec		$rndkey0,$inout0
-	$movkey		64($key_),$rndkey0
-	jmp		.Locb_dec_loop1
-
-.align	32
-.Locb_dec_loop1:
-	aesdec		$rndkey1,$inout0
-	$movkey		($key,%rax),$rndkey1
-	add		\$32,%rax
-
-	aesdec		$rndkey0,$inout0
-	$movkey		-16($key,%rax),$rndkey0
-	jnz		.Locb_dec_loop1
-
-	aesdec		$rndkey1,$inout0
-	$movkey		16($key_),$rndkey1	# redundant in tail
-	mov		%r10,%rax		# restore twisted rounds
-
-	aesdeclast	$inout5,$inout0
-	ret
-.size	__ocb_decrypt1,.-__ocb_decrypt1
-___
 } }}
 
 ########################################################################