crypto/aes/asm/aesv8-armx.pl - boringssl - Git at Google

 #!/usr/bin/env perl
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # This module implements support for ARMv8 AES instructions. The
 # module is endian-agnostic in sense that it supports both big- and
 # little-endian cases. As does it support both 32- and 64-bit modes
 # of operation. Latter is achieved by limiting amount of utilized
 # registers to 16, which implies additional NEON load and integer
 # instructions. This has no effect on mighty Apple A7, where results
 # are literally equal to the theoretical estimates based on AES
 # instruction latencies and issue rates. On Cortex-A53, an in-order
 # execution core, this costs up to 10-15%, which is partially
 # compensated by implementing dedicated code path for 128-bit
 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
 # seems to be limited by sheer amount of NEON instructions...
 #
 # Performance in cycles per byte processed with 128-bit key:
 #
 #		CBC enc		CBC dec		CTR
 # Apple A7	2.39		1.20		1.20
 # Cortex-A53	2.45		1.87		1.94
 # Cortex-A57	3.64		1.34		1.32

 $flavour = shift;
 open STDOUT,">".shift;

 $prefix="aes_v8";

 $code=<<___;
 #include "arm_arch.h"

 #if __ARM_MAX_ARCH__>=7
 .text
 ___
 $code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
 $code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
 		#^^^^^^ this is done to simplify adoption by not depending
 		#	on latest binutils.

 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
 # maintain both 32- and 64-bit codes within single module and
 # transliterate common code to either flavour with regex vodoo.
 #
 {{{
 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
 	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


 $code.=<<___;
 .align	5
 rcon:
 .long	0x01,0x01,0x01,0x01
 .long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
 .long	0x1b,0x1b,0x1b,0x1b

 .globl	${prefix}_set_encrypt_key
 .type	${prefix}_set_encrypt_key,%function
 .align	5
 ${prefix}_set_encrypt_key:
 .Lenc_key:
 ___
 $code.=<<___	if ($flavour =~ /64/);
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
 ___
 $code.=<<___;
 	mov	$ptr,#-1
 	cmp	$inp,#0
 	b.eq	.Lenc_key_abort
 	cmp	$out,#0
 	b.eq	.Lenc_key_abort
 	mov	$ptr,#-2
 	cmp	$bits,#128
 	b.lt	.Lenc_key_abort
 	cmp	$bits,#256
 	b.gt	.Lenc_key_abort
 	tst	$bits,#0x3f
 	b.ne	.Lenc_key_abort

 	adr	$ptr,rcon
 	cmp	$bits,#192

 	veor	$zero,$zero,$zero
 	vld1.8	{$in0},[$inp],#16
 	mov	$bits,#8		// reuse $bits
 	vld1.32	{$rcon,$mask},[$ptr],#32

 	b.lt	.Loop128
 	b.eq	.L192
 	b	.L256

 .align	4
 .Loop128:
 	vtbl.8	$key,{$in0},$mask
 	vext.8	$tmp,$zero,$in0,#12
 	vst1.32	{$in0},[$out],#16
 	aese	$key,$zero
 	subs	$bits,$bits,#1

 	veor	$in0,$in0,$tmp
 	vext.8	$tmp,$zero,$tmp,#12
 	veor	$in0,$in0,$tmp
 	vext.8	$tmp,$zero,$tmp,#12
 	 veor	$key,$key,$rcon
 	veor	$in0,$in0,$tmp
 	vshl.u8	$rcon,$rcon,#1
 	veor	$in0,$in0,$key
 	b.ne	.Loop128

 	vld1.32	{$rcon},[$ptr]

 	vtbl.8	$key,{$in0},$mask
 	vext.8	$tmp,$zero,$in0,#12
 	vst1.32	{$in0},[$out],#16
 	aese	$key,$zero

 	veor	$in0,$in0,$tmp
 	vext.8	$tmp,$zero,$tmp,#12
 	veor	$in0,$in0,$tmp
 	vext.8	$tmp,$zero,$tmp,#12
 	 veor	$key,$key,$rcon
 	veor	$in0,$in0,$tmp
 	vshl.u8	$rcon,$rcon,#1
 	veor	$in0,$in0,$key

 	vtbl.8	$key,{$in0},$mask
 	vext.8	$tmp,$zero,$in0,#12
 	vst1.32	{$in0},[$out],#16
 	aese	$key,$zero

 	veor	$in0,$in0,$tmp
 	vext.8	$tmp,$zero,$tmp,#12
 	veor	$in0,$in0,$tmp
 	vext.8	$tmp,$zero,$tmp,#12
 	 veor	$key,$key,$rcon
 	veor	$in0,$in0,$tmp
 	veor	$in0,$in0,$key
 	vst1.32	{$in0},[$out]
 	add	$out,$out,#0x50

 	mov	$rounds,#10
 	b	.Ldone

 .align	4
 .L192:
 	vld1.8	{$in1},[$inp],#8
 	vmov.i8	$key,#8			// borrow $key
 	vst1.32	{$in0},[$out],#16
 	vsub.i8	$mask,$mask,$key	// adjust the mask

 .Loop192:
 	vtbl.8	$key,{$in1},$mask
 	vext.8	$tmp,$zero,$in0,#12
 	vst1.32	{$in1},[$out],#8
 	aese	$key,$zero
 	subs	$bits,$bits,#1

 	veor	$in0,$in0,$tmp
 	vext.8	$tmp,$zero,$tmp,#12
 	veor	$in0,$in0,$tmp
 	vext.8	$tmp,$zero,$tmp,#12
 	veor	$in0,$in0,$tmp

 	vdup.32	$tmp,${in0}[3]
 	veor	$tmp,$tmp,$in1
 	 veor	$key,$key,$rcon
 	vext.8	$in1,$zero,$in1,#12
 	vshl.u8	$rcon,$rcon,#1
 	veor	$in1,$in1,$tmp
 	veor	$in0,$in0,$key
 	veor	$in1,$in1,$key
 	vst1.32	{$in0},[$out],#16
 	b.ne	.Loop192

 	mov	$rounds,#12
 	add	$out,$out,#0x20
 	b	.Ldone

 .align	4
 .L256:
 	vld1.8	{$in1},[$inp]
 	mov	$bits,#7
 	mov	$rounds,#14
 	vst1.32	{$in0},[$out],#16

 .Loop256:
 	vtbl.8	$key,{$in1},$mask
 	vext.8	$tmp,$zero,$in0,#12
 	vst1.32	{$in1},[$out],#16
 	aese	$key,$zero
 	subs	$bits,$bits,#1

 	veor	$in0,$in0,$tmp
 	vext.8	$tmp,$zero,$tmp,#12
 	veor	$in0,$in0,$tmp
 	vext.8	$tmp,$zero,$tmp,#12
 	 veor	$key,$key,$rcon
 	veor	$in0,$in0,$tmp
 	vshl.u8	$rcon,$rcon,#1
 	veor	$in0,$in0,$key
 	vst1.32	{$in0},[$out],#16
 	b.eq	.Ldone

 	vdup.32	$key,${in0}[3]		// just splat
 	vext.8	$tmp,$zero,$in1,#12
 	aese	$key,$zero

 	veor	$in1,$in1,$tmp
 	vext.8	$tmp,$zero,$tmp,#12
 	veor	$in1,$in1,$tmp
 	vext.8	$tmp,$zero,$tmp,#12
 	veor	$in1,$in1,$tmp

 	veor	$in1,$in1,$key
 	b	.Loop256

 .Ldone:
 	str	$rounds,[$out]
 	mov	$ptr,#0

 .Lenc_key_abort:
 	mov	x0,$ptr			// return value
 	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
 	ret
 .size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

 .globl	${prefix}_set_decrypt_key
 .type	${prefix}_set_decrypt_key,%function
 .align	5
 ${prefix}_set_decrypt_key:
 ___
 $code.=<<___	if ($flavour =~ /64/);
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
 ___
 $code.=<<___	if ($flavour !~ /64/);
 	stmdb	sp!,{r4,lr}
 ___
 $code.=<<___;
 	bl	.Lenc_key

 	cmp	x0,#0
 	b.ne	.Ldec_key_abort

 	sub	$out,$out,#240		// restore original $out
 	mov	x4,#-16
 	add	$inp,$out,x12,lsl#4	// end of key schedule

 	vld1.32	{v0.16b},[$out]
 	vld1.32	{v1.16b},[$inp]
 	vst1.32	{v0.16b},[$inp],x4
 	vst1.32	{v1.16b},[$out],#16

 .Loop_imc:
 	vld1.32	{v0.16b},[$out]
 	vld1.32	{v1.16b},[$inp]
 	aesimc	v0.16b,v0.16b
 	aesimc	v1.16b,v1.16b
 	vst1.32	{v0.16b},[$inp],x4
 	vst1.32	{v1.16b},[$out],#16
 	cmp	$inp,$out
 	b.hi	.Loop_imc

 	vld1.32	{v0.16b},[$out]
 	aesimc	v0.16b,v0.16b
 	vst1.32	{v0.16b},[$inp]

 	eor	x0,x0,x0		// return value
 .Ldec_key_abort:
 ___
 $code.=<<___	if ($flavour !~ /64/);
 	ldmia	sp!,{r4,pc}
 ___
 $code.=<<___	if ($flavour =~ /64/);
 	ldp	x29,x30,[sp],#16
 	ret
 ___
 $code.=<<___;
 .size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
 ___
 }}}
 {{{
 sub gen_block () {
 my $dir = shift;
 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
 my ($inp,$out,$key)=map("x$_",(0..2));
 my $rounds="w3";
 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

 $code.=<<___;
 .globl	${prefix}_${dir}crypt
 .type	${prefix}_${dir}crypt,%function
 .align	5
 ${prefix}_${dir}crypt:
 	ldr	$rounds,[$key,#240]
 	vld1.32	{$rndkey0},[$key],#16
 	vld1.8	{$inout},[$inp]
 	sub	$rounds,$rounds,#2
 	vld1.32	{$rndkey1},[$key],#16

 .Loop_${dir}c:
 	aes$e	$inout,$rndkey0
 	vld1.32	{$rndkey0},[$key],#16
 	aes$mc	$inout,$inout
 	subs	$rounds,$rounds,#2
 	aes$e	$inout,$rndkey1
 	vld1.32	{$rndkey1},[$key],#16
 	aes$mc	$inout,$inout
 	b.gt	.Loop_${dir}c

 	aes$e	$inout,$rndkey0
 	vld1.32	{$rndkey0},[$key]
 	aes$mc	$inout,$inout
 	aes$e	$inout,$rndkey1
 	veor	$inout,$inout,$rndkey0

 	vst1.8	{$inout},[$out]
 	ret
 .size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
 ___
 }
 &gen_block("en");
 &gen_block("de");
 }}}
 {{{
 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

 ### q8-q15	preloaded key schedule

 $code.=<<___;
 .globl	${prefix}_cbc_encrypt
 .type	${prefix}_cbc_encrypt,%function
 .align	5
 ${prefix}_cbc_encrypt:
 ___
 $code.=<<___	if ($flavour =~ /64/);
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
 ___
 $code.=<<___	if ($flavour !~ /64/);
 	mov	ip,sp
 	stmdb	sp!,{r4-r8,lr}
 	vstmdb	sp!,{d8-d15}            @ ABI specification says so
 	ldmia	ip,{r4-r5}		@ load remaining args
 ___
 $code.=<<___;
 	subs	$len,$len,#16
 	mov	$step,#16
 	b.lo	.Lcbc_abort
 	cclr	$step,eq

 	cmp	$enc,#0			// en- or decrypting?
 	ldr	$rounds,[$key,#240]
 	and	$len,$len,#-16
 	vld1.8	{$ivec},[$ivp]
 	vld1.8	{$dat},[$inp],$step

 	vld1.32	{q8-q9},[$key]		// load key schedule...
 	sub	$rounds,$rounds,#6
 	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
 	sub	$rounds,$rounds,#2
 	vld1.32	{q10-q11},[$key_],#32
 	vld1.32	{q12-q13},[$key_],#32
 	vld1.32	{q14-q15},[$key_],#32
 	vld1.32	{$rndlast},[$key_]

 	add	$key_,$key,#32
 	mov	$cnt,$rounds
 	b.eq	.Lcbc_dec

 	cmp	$rounds,#2
 	veor	$dat,$dat,$ivec
 	veor	$rndzero_n_last,q8,$rndlast
 	b.eq	.Lcbc_enc128

 .Loop_cbc_enc:
 	aese	$dat,q8
 	vld1.32	{q8},[$key_],#16
 	aesmc	$dat,$dat
 	subs	$cnt,$cnt,#2
 	aese	$dat,q9
 	vld1.32	{q9},[$key_],#16
 	aesmc	$dat,$dat
 	b.gt	.Loop_cbc_enc

 	aese	$dat,q8
 	aesmc	$dat,$dat
 	 subs	$len,$len,#16
 	aese	$dat,q9
 	aesmc	$dat,$dat
 	 cclr	$step,eq
 	aese	$dat,q10
 	aesmc	$dat,$dat
 	 add	$key_,$key,#16
 	aese	$dat,q11
 	aesmc	$dat,$dat
 	 vld1.8	{q8},[$inp],$step
 	aese	$dat,q12
 	aesmc	$dat,$dat
 	 veor	q8,q8,$rndzero_n_last
 	aese	$dat,q13
 	aesmc	$dat,$dat
 	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
 	aese	$dat,q14
 	aesmc	$dat,$dat
 	aese	$dat,q15

 	 mov	$cnt,$rounds
 	veor	$ivec,$dat,$rndlast
 	vst1.8	{$ivec},[$out],#16
 	b.hs	.Loop_cbc_enc

 	b	.Lcbc_done

 .align	5
 .Lcbc_enc128:
 	vld1.32	{$in0-$in1},[$key_]
 	aese	$dat,q8
 	aesmc	$dat,$dat
 	b	.Lenter_cbc_enc128
 .Loop_cbc_enc128:
 	aese	$dat,q8
 	aesmc	$dat,$dat
 	 vst1.8	{$ivec},[$out],#16
 .Lenter_cbc_enc128:
 	aese	$dat,q9
 	aesmc	$dat,$dat
 	 subs	$len,$len,#16
 	aese	$dat,$in0
 	aesmc	$dat,$dat
 	 cclr	$step,eq
 	aese	$dat,$in1
 	aesmc	$dat,$dat
 	aese	$dat,q10
 	aesmc	$dat,$dat
 	aese	$dat,q11
 	aesmc	$dat,$dat
 	 vld1.8	{q8},[$inp],$step
 	aese	$dat,q12
 	aesmc	$dat,$dat
 	aese	$dat,q13
 	aesmc	$dat,$dat
 	aese	$dat,q14
 	aesmc	$dat,$dat
 	 veor	q8,q8,$rndzero_n_last
 	aese	$dat,q15
 	veor	$ivec,$dat,$rndlast
 	b.hs	.Loop_cbc_enc128

 	vst1.8	{$ivec},[$out],#16
 	b	.Lcbc_done
 ___
 {
 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
 $code.=<<___;
 .align	5
 .Lcbc_dec:
 	vld1.8	{$dat2},[$inp],#16
 	subs	$len,$len,#32		// bias
 	add	$cnt,$rounds,#2
 	vorr	$in1,$dat,$dat
 	vorr	$dat1,$dat,$dat
 	vorr	$in2,$dat2,$dat2
 	b.lo	.Lcbc_dec_tail

 	vorr	$dat1,$dat2,$dat2
 	vld1.8	{$dat2},[$inp],#16
 	vorr	$in0,$dat,$dat
 	vorr	$in1,$dat1,$dat1
 	vorr	$in2,$dat2,$dat2

 .Loop3x_cbc_dec:
 	aesd	$dat0,q8
 	aesd	$dat1,q8
 	aesd	$dat2,q8
 	vld1.32	{q8},[$key_],#16
 	aesimc	$dat0,$dat0
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	subs	$cnt,$cnt,#2
 	aesd	$dat0,q9
 	aesd	$dat1,q9
 	aesd	$dat2,q9
 	vld1.32	{q9},[$key_],#16
 	aesimc	$dat0,$dat0
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	b.gt	.Loop3x_cbc_dec

 	aesd	$dat0,q8
 	aesd	$dat1,q8
 	aesd	$dat2,q8
 	 veor	$tmp0,$ivec,$rndlast
 	aesimc	$dat0,$dat0
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	 veor	$tmp1,$in0,$rndlast
 	aesd	$dat0,q9
 	aesd	$dat1,q9
 	aesd	$dat2,q9
 	 veor	$tmp2,$in1,$rndlast
 	 subs	$len,$len,#0x30
 	aesimc	$dat0,$dat0
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	 vorr	$ivec,$in2,$in2
 	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
 	aesd	$dat0,q12
 	aesd	$dat1,q12
 	aesd	$dat2,q12
 	 add	$inp,$inp,x6		// $inp is adjusted in such way that
 					// at exit from the loop $dat1-$dat2
 					// are loaded with last "words"
 	aesimc	$dat0,$dat0
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	 mov	$key_,$key
 	aesd	$dat0,q13
 	aesd	$dat1,q13
 	aesd	$dat2,q13
 	 vld1.8	{$in0},[$inp],#16
 	aesimc	$dat0,$dat0
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	 vld1.8	{$in1},[$inp],#16
 	aesd	$dat0,q14
 	aesd	$dat1,q14
 	aesd	$dat2,q14
 	 vld1.8	{$in2},[$inp],#16
 	aesimc	$dat0,$dat0
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
 	aesd	$dat0,q15
 	aesd	$dat1,q15
 	aesd	$dat2,q15

 	 add	$cnt,$rounds,#2
 	veor	$tmp0,$tmp0,$dat0
 	veor	$tmp1,$tmp1,$dat1
 	veor	$dat2,$dat2,$tmp2
 	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
 	 vorr	$dat0,$in0,$in0
 	vst1.8	{$tmp0},[$out],#16
 	 vorr	$dat1,$in1,$in1
 	vst1.8	{$tmp1},[$out],#16
 	vst1.8	{$dat2},[$out],#16
 	 vorr	$dat2,$in2,$in2
 	b.hs	.Loop3x_cbc_dec

 	cmn	$len,#0x30
 	b.eq	.Lcbc_done
 	nop

 .Lcbc_dec_tail:
 	aesd	$dat1,q8
 	aesd	$dat2,q8
 	vld1.32	{q8},[$key_],#16
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	subs	$cnt,$cnt,#2
 	aesd	$dat1,q9
 	aesd	$dat2,q9
 	vld1.32	{q9},[$key_],#16
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	b.gt	.Lcbc_dec_tail

 	aesd	$dat1,q8
 	aesd	$dat2,q8
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	aesd	$dat1,q9
 	aesd	$dat2,q9
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	aesd	$dat1,q12
 	aesd	$dat2,q12
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	 cmn	$len,#0x20
 	aesd	$dat1,q13
 	aesd	$dat2,q13
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	 veor	$tmp1,$ivec,$rndlast
 	aesd	$dat1,q14
 	aesd	$dat2,q14
 	aesimc	$dat1,$dat1
 	aesimc	$dat2,$dat2
 	 veor	$tmp2,$in1,$rndlast
 	aesd	$dat1,q15
 	aesd	$dat2,q15
 	b.eq	.Lcbc_dec_one
 	veor	$tmp1,$tmp1,$dat1
 	veor	$tmp2,$tmp2,$dat2
 	 vorr	$ivec,$in2,$in2
 	vst1.8	{$tmp1},[$out],#16
 	vst1.8	{$tmp2},[$out],#16
 	b	.Lcbc_done

 .Lcbc_dec_one:
 	veor	$tmp1,$tmp1,$dat2
 	 vorr	$ivec,$in2,$in2
 	vst1.8	{$tmp1},[$out],#16

 .Lcbc_done:
 	vst1.8	{$ivec},[$ivp]
 .Lcbc_abort:
 ___
 }
 $code.=<<___	if ($flavour !~ /64/);
 	vldmia	sp!,{d8-d15}
 	ldmia	sp!,{r4-r8,pc}
 ___
 $code.=<<___	if ($flavour =~ /64/);
 	ldr	x29,[sp],#16
 	ret
 ___
 $code.=<<___;
 .size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
 ___
 }}}
 {{{
 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
 my ($rounds,$cnt,$key_)=("w5","w6","x7");
 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
 my $step="x12";		# aliases with $tctr2

 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

 my ($dat,$tmp)=($dat0,$tmp0);

 ### q8-q15	preloaded key schedule

 $code.=<<___;
 .globl	${prefix}_ctr32_encrypt_blocks
 .type	${prefix}_ctr32_encrypt_blocks,%function
 .align	5
 ${prefix}_ctr32_encrypt_blocks:
 ___
 $code.=<<___	if ($flavour =~ /64/);
 	stp		x29,x30,[sp,#-16]!
 	add		x29,sp,#0
 ___
 $code.=<<___	if ($flavour !~ /64/);
 	mov		ip,sp
 	stmdb		sp!,{r4-r10,lr}
 	vstmdb		sp!,{d8-d15}            @ ABI specification says so
 	ldr		r4, [ip]		@ load remaining arg
 ___
 $code.=<<___;
 	ldr		$rounds,[$key,#240]

 	ldr		$ctr, [$ivp, #12]
 	vld1.32		{$dat0},[$ivp]

 	vld1.32		{q8-q9},[$key]		// load key schedule...
 	sub		$rounds,$rounds,#4
 	mov		$step,#16
 	cmp		$len,#2
 	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
 	sub		$rounds,$rounds,#2
 	vld1.32		{q12-q13},[$key_],#32
 	vld1.32		{q14-q15},[$key_],#32
 	vld1.32		{$rndlast},[$key_]
 	add		$key_,$key,#32
 	mov		$cnt,$rounds
 	cclr		$step,lo
 #ifndef __ARMEB__
 	rev		$ctr, $ctr
 #endif
 	vorr		$dat1,$dat0,$dat0
 	add		$tctr1, $ctr, #1
 	vorr		$dat2,$dat0,$dat0
 	add		$ctr, $ctr, #2
 	vorr		$ivec,$dat0,$dat0
 	rev		$tctr1, $tctr1
 	vmov.32		${dat1}[3],$tctr1
 	b.ls		.Lctr32_tail
 	rev		$tctr2, $ctr
 	sub		$len,$len,#3		// bias
 	vmov.32		${dat2}[3],$tctr2
 	b		.Loop3x_ctr32

 .align	4
 .Loop3x_ctr32:
 	aese		$dat0,q8
 	aese		$dat1,q8
 	aese		$dat2,q8
 	vld1.32		{q8},[$key_],#16
 	aesmc		$dat0,$dat0
 	aesmc		$dat1,$dat1
 	aesmc		$dat2,$dat2
 	subs		$cnt,$cnt,#2
 	aese		$dat0,q9
 	aese		$dat1,q9
 	aese		$dat2,q9
 	vld1.32		{q9},[$key_],#16
 	aesmc		$dat0,$dat0
 	aesmc		$dat1,$dat1
 	aesmc		$dat2,$dat2
 	b.gt		.Loop3x_ctr32

 	aese		$dat0,q8
 	aese		$dat1,q8
 	aese		$dat2,q8
 	 mov		$key_,$key
 	aesmc		$tmp0,$dat0
 	 vld1.8		{$in0},[$inp],#16
 	aesmc		$tmp1,$dat1
 	aesmc		$dat2,$dat2
 	 vorr		$dat0,$ivec,$ivec
 	aese		$tmp0,q9
 	 vld1.8		{$in1},[$inp],#16
 	aese		$tmp1,q9
 	aese		$dat2,q9
 	 vorr		$dat1,$ivec,$ivec
 	aesmc		$tmp0,$tmp0
 	 vld1.8		{$in2},[$inp],#16
 	aesmc		$tmp1,$tmp1
 	aesmc		$tmp2,$dat2
 	 vorr		$dat2,$ivec,$ivec
 	 add		$tctr0,$ctr,#1
 	aese		$tmp0,q12
 	aese		$tmp1,q12
 	aese		$tmp2,q12
 	 veor		$in0,$in0,$rndlast
 	 add		$tctr1,$ctr,#2
 	aesmc		$tmp0,$tmp0
 	aesmc		$tmp1,$tmp1
 	aesmc		$tmp2,$tmp2
 	 veor		$in1,$in1,$rndlast
 	 add		$ctr,$ctr,#3
 	aese		$tmp0,q13
 	aese		$tmp1,q13
 	aese		$tmp2,q13
 	 veor		$in2,$in2,$rndlast
 	 rev		$tctr0,$tctr0
 	aesmc		$tmp0,$tmp0
 	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
 	aesmc		$tmp1,$tmp1
 	aesmc		$tmp2,$tmp2
 	 vmov.32	${dat0}[3], $tctr0
 	 rev		$tctr1,$tctr1
 	aese		$tmp0,q14
 	aese		$tmp1,q14
 	aese		$tmp2,q14
 	 vmov.32	${dat1}[3], $tctr1
 	 rev		$tctr2,$ctr
 	aesmc		$tmp0,$tmp0
 	aesmc		$tmp1,$tmp1
 	aesmc		$tmp2,$tmp2
 	 vmov.32	${dat2}[3], $tctr2
 	 subs		$len,$len,#3
 	aese		$tmp0,q15
 	aese		$tmp1,q15
 	aese		$tmp2,q15

 	 mov		$cnt,$rounds
 	veor		$in0,$in0,$tmp0
 	veor		$in1,$in1,$tmp1
 	veor		$in2,$in2,$tmp2
 	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
 	vst1.8		{$in0},[$out],#16
 	vst1.8		{$in1},[$out],#16
 	vst1.8		{$in2},[$out],#16
 	b.hs		.Loop3x_ctr32

 	adds		$len,$len,#3
 	b.eq		.Lctr32_done
 	cmp		$len,#1
 	mov		$step,#16
 	cclr		$step,eq

 .Lctr32_tail:
 	aese		$dat0,q8
 	aese		$dat1,q8
 	vld1.32		{q8},[$key_],#16
 	aesmc		$dat0,$dat0
 	aesmc		$dat1,$dat1
 	subs		$cnt,$cnt,#2
 	aese		$dat0,q9
 	aese		$dat1,q9
 	vld1.32		{q9},[$key_],#16
 	aesmc		$dat0,$dat0
 	aesmc		$dat1,$dat1
 	b.gt		.Lctr32_tail

 	aese		$dat0,q8
 	aese		$dat1,q8
 	aesmc		$dat0,$dat0
 	aesmc		$dat1,$dat1
 	aese		$dat0,q9
 	aese		$dat1,q9
 	aesmc		$dat0,$dat0
 	aesmc		$dat1,$dat1
 	 vld1.8		{$in0},[$inp],$step
 	aese		$dat0,q12
 	aese		$dat1,q12
 	 vld1.8		{$in1},[$inp]
 	aesmc		$dat0,$dat0
 	aesmc		$dat1,$dat1
 	aese		$dat0,q13
 	aese		$dat1,q13
 	aesmc		$dat0,$dat0
 	aesmc		$dat1,$dat1
 	aese		$dat0,q14
 	aese		$dat1,q14
 	 veor		$in0,$in0,$rndlast
 	aesmc		$dat0,$dat0
 	aesmc		$dat1,$dat1
 	 veor		$in1,$in1,$rndlast
 	aese		$dat0,q15
 	aese		$dat1,q15

 	cmp		$len,#1
 	veor		$in0,$in0,$dat0
 	veor		$in1,$in1,$dat1
 	vst1.8		{$in0},[$out],#16
 	b.eq		.Lctr32_done
 	vst1.8		{$in1},[$out]

 .Lctr32_done:
 ___
 $code.=<<___	if ($flavour !~ /64/);
 	vldmia		sp!,{d8-d15}
 	ldmia		sp!,{r4-r10,pc}
 ___
 $code.=<<___	if ($flavour =~ /64/);
 	ldr		x29,[sp],#16
 	ret
 ___
 $code.=<<___;
 .size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
 ___
 }}}
 $code.=<<___;
 #endif
 ___
 ########################################
 if ($flavour =~ /64/) {			######## 64-bit code
     my %opcode = (
 	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
 	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

     local *unaes = sub {
 	my ($mnemonic,$arg)=@_;

 	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
 	sprintf ".inst\t0x%08x\t//%s %s",
 			$opcode{$mnemonic}|$1|($2<<5),
 			$mnemonic,$arg;
     };

     foreach(split("\n",$code)) {
 	s/\`([^\`]*)\`/eval($1)/geo;

 	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
 	s/@\s/\/\//o;			# old->new style commentary

 	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
 	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
 	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
 	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
 	s/vext\.8/ext/o		or
 	s/vrev32\.8/rev32/o	or
 	s/vtst\.8/cmtst/o	or
 	s/vshr/ushr/o		or
 	s/^(\s+)v/$1/o		or	# strip off v prefix
 	s/\bbx\s+lr\b/ret/o;

 	# fix up remainig legacy suffixes
 	s/\.[ui]?8//o;
 	m/\],#8/o and s/\.16b/\.8b/go;
 	s/\.[ui]?32//o and s/\.16b/\.4s/go;
 	s/\.[ui]?64//o and s/\.16b/\.2d/go;
 	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

 	print $_,"\n";
     }
 } else {				######## 32-bit code
     my %opcode = (
 	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
 	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

     local *unaes = sub {
 	my ($mnemonic,$arg)=@_;

 	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
 	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
 					 |(($2&7)<<1) |(($2&8)<<2);
 	    # since ARMv7 instructions are always encoded little-endian.
 	    # correct solution is to use .inst directive, but older
 	    # assemblers don't implement it:-(
 	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
 			$word&0xff,($word>>8)&0xff,
 			($word>>16)&0xff,($word>>24)&0xff,
 			$mnemonic,$arg;
 	}
     };

     sub unvtbl {
 	my $arg=shift;

 	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
 	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
 		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
     }

     sub unvdup32 {
 	my $arg=shift;

 	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
 	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
     }

     sub unvmov32 {
 	my $arg=shift;

 	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
 	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
     }

     foreach(split("\n",$code)) {
 	s/\`([^\`]*)\`/eval($1)/geo;

 	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
 	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
 	s/\/\/\s?/@ /o;				# new->old style commentary

 	# fix up remainig new-style suffixes
 	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
 	s/\],#[0-9]+/]!/o;

 	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
 	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
 	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
 	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
 	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
 	s/^(\s+)b\./$1b/o				or
 	s/^(\s+)mov\./$1mov/o				or
 	s/^(\s+)ret/$1bx\tlr/o;

 	print $_,"\n";
     }
 }

 close STDOUT;
	#!/usr/bin/env perl
	#
	# ====================================================================
	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
	# project. The module is, however, dual licensed under OpenSSL and
	# CRYPTOGAMS licenses depending on where you obtain it. For further
	# details see http://www.openssl.org/~appro/cryptogams/.
	# ====================================================================
	#
	# This module implements support for ARMv8 AES instructions. The
	# module is endian-agnostic in sense that it supports both big- and
	# little-endian cases. As does it support both 32- and 64-bit modes
	# of operation. Latter is achieved by limiting amount of utilized
	# registers to 16, which implies additional NEON load and integer
	# instructions. This has no effect on mighty Apple A7, where results
	# are literally equal to the theoretical estimates based on AES
	# instruction latencies and issue rates. On Cortex-A53, an in-order
	# execution core, this costs up to 10-15%, which is partially
	# compensated by implementing dedicated code path for 128-bit
	# CBC encrypt case. On Cortex-A57 parallelizable mode performance
	# seems to be limited by sheer amount of NEON instructions...
	#
	# Performance in cycles per byte processed with 128-bit key:
	#
	# CBC enc CBC dec CTR
	# Apple A7 2.39 1.20 1.20
	# Cortex-A53 2.45 1.87 1.94
	# Cortex-A57 3.64 1.34 1.32

	$flavour = shift;
	open STDOUT,">".shift;

	$prefix="aes_v8";

	$code=<<___;
	#include "arm_arch.h"

	#if __ARM_MAX_ARCH__>=7
	.text
	___
	$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
	$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
	#^^^^^^ this is done to simplify adoption by not depending
	# on latest binutils.

	# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
	# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
	# maintain both 32- and 64-bit codes within single module and
	# transliterate common code to either flavour with regex vodoo.
	#
	{{{
	my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
	my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


	$code.=<<___;
	.align 5
	rcon:
	.long 0x01,0x01,0x01,0x01
	.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
	.long 0x1b,0x1b,0x1b,0x1b

	.globl ${prefix}_set_encrypt_key
	.type ${prefix}_set_encrypt_key,%function
	.align 5
	${prefix}_set_encrypt_key:
	.Lenc_key:
	___
	$code.=<<___ if ($flavour =~ /64/);
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
	___
	$code.=<<___;
	mov $ptr,#-1
	cmp $inp,#0
	b.eq .Lenc_key_abort
	cmp $out,#0
	b.eq .Lenc_key_abort
	mov $ptr,#-2
	cmp $bits,#128
	b.lt .Lenc_key_abort
	cmp $bits,#256
	b.gt .Lenc_key_abort
	tst $bits,#0x3f
	b.ne .Lenc_key_abort

	adr $ptr,rcon
	cmp $bits,#192

	veor $zero,$zero,$zero
	vld1.8 {$in0},[$inp],#16
	mov $bits,#8 // reuse $bits
	vld1.32 {$rcon,$mask},[$ptr],#32

	b.lt .Loop128
	b.eq .L192
	b .L256

	.align 4
	.Loop128:
	vtbl.8 $key,{$in0},$mask
	vext.8 $tmp,$zero,$in0,#12
	vst1.32 {$in0},[$out],#16
	aese $key,$zero
	subs $bits,$bits,#1

	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $key,$key,$rcon
	veor $in0,$in0,$tmp
	vshl.u8 $rcon,$rcon,#1
	veor $in0,$in0,$key
	b.ne .Loop128

	vld1.32 {$rcon},[$ptr]

	vtbl.8 $key,{$in0},$mask
	vext.8 $tmp,$zero,$in0,#12
	vst1.32 {$in0},[$out],#16
	aese $key,$zero

	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $key,$key,$rcon
	veor $in0,$in0,$tmp
	vshl.u8 $rcon,$rcon,#1
	veor $in0,$in0,$key

	vtbl.8 $key,{$in0},$mask
	vext.8 $tmp,$zero,$in0,#12
	vst1.32 {$in0},[$out],#16
	aese $key,$zero

	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $key,$key,$rcon
	veor $in0,$in0,$tmp
	veor $in0,$in0,$key
	vst1.32 {$in0},[$out]
	add $out,$out,#0x50

	mov $rounds,#10
	b .Ldone

	.align 4
	.L192:
	vld1.8 {$in1},[$inp],#8
	vmov.i8 $key,#8 // borrow $key
	vst1.32 {$in0},[$out],#16
	vsub.i8 $mask,$mask,$key // adjust the mask

	.Loop192:
	vtbl.8 $key,{$in1},$mask
	vext.8 $tmp,$zero,$in0,#12
	vst1.32 {$in1},[$out],#8
	aese $key,$zero
	subs $bits,$bits,#1

	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in0,$in0,$tmp

	vdup.32 $tmp,${in0}[3]
	veor $tmp,$tmp,$in1
	veor $key,$key,$rcon
	vext.8 $in1,$zero,$in1,#12
	vshl.u8 $rcon,$rcon,#1
	veor $in1,$in1,$tmp
	veor $in0,$in0,$key
	veor $in1,$in1,$key
	vst1.32 {$in0},[$out],#16
	b.ne .Loop192

	mov $rounds,#12
	add $out,$out,#0x20
	b .Ldone

	.align 4
	.L256:
	vld1.8 {$in1},[$inp]
	mov $bits,#7
	mov $rounds,#14
	vst1.32 {$in0},[$out],#16

	.Loop256:
	vtbl.8 $key,{$in1},$mask
	vext.8 $tmp,$zero,$in0,#12
	vst1.32 {$in1},[$out],#16
	aese $key,$zero
	subs $bits,$bits,#1

	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $key,$key,$rcon
	veor $in0,$in0,$tmp
	vshl.u8 $rcon,$rcon,#1
	veor $in0,$in0,$key
	vst1.32 {$in0},[$out],#16
	b.eq .Ldone

	vdup.32 $key,${in0}[3] // just splat
	vext.8 $tmp,$zero,$in1,#12
	aese $key,$zero

	veor $in1,$in1,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in1,$in1,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in1,$in1,$tmp

	veor $in1,$in1,$key
	b .Loop256

	.Ldone:
	str $rounds,[$out]
	mov $ptr,#0

	.Lenc_key_abort:
	mov x0,$ptr // return value
	`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
	ret
	.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

	.globl ${prefix}_set_decrypt_key
	.type ${prefix}_set_decrypt_key,%function
	.align 5
	${prefix}_set_decrypt_key:
	___
	$code.=<<___ if ($flavour =~ /64/);
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
	___
	$code.=<<___ if ($flavour !~ /64/);
	stmdb sp!,{r4,lr}
	___
	$code.=<<___;
	bl .Lenc_key

	cmp x0,#0
	b.ne .Ldec_key_abort

	sub $out,$out,#240 // restore original $out
	mov x4,#-16
	add $inp,$out,x12,lsl#4 // end of key schedule

	vld1.32 {v0.16b},[$out]
	vld1.32 {v1.16b},[$inp]
	vst1.32 {v0.16b},[$inp],x4
	vst1.32 {v1.16b},[$out],#16

	.Loop_imc:
	vld1.32 {v0.16b},[$out]
	vld1.32 {v1.16b},[$inp]
	aesimc v0.16b,v0.16b
	aesimc v1.16b,v1.16b
	vst1.32 {v0.16b},[$inp],x4
	vst1.32 {v1.16b},[$out],#16
	cmp $inp,$out
	b.hi .Loop_imc

	vld1.32 {v0.16b},[$out]
	aesimc v0.16b,v0.16b
	vst1.32 {v0.16b},[$inp]

	eor x0,x0,x0 // return value
	.Ldec_key_abort:
	___
	$code.=<<___ if ($flavour !~ /64/);
	ldmia sp!,{r4,pc}
	___
	$code.=<<___ if ($flavour =~ /64/);
	ldp x29,x30,[sp],#16
	ret
	___
	$code.=<<___;
	.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
	___
	}}}
	{{{
	sub gen_block () {
	my $dir = shift;
	my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
	my ($inp,$out,$key)=map("x$_",(0..2));
	my $rounds="w3";
	my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

	$code.=<<___;
	.globl ${prefix}_${dir}crypt
	.type ${prefix}_${dir}crypt,%function
	.align 5
	${prefix}_${dir}crypt:
	ldr $rounds,[$key,#240]
	vld1.32 {$rndkey0},[$key],#16
	vld1.8 {$inout},[$inp]
	sub $rounds,$rounds,#2
	vld1.32 {$rndkey1},[$key],#16

	.Loop_${dir}c:
	aes$e $inout,$rndkey0
	vld1.32 {$rndkey0},[$key],#16
	aes$mc $inout,$inout
	subs $rounds,$rounds,#2
	aes$e $inout,$rndkey1
	vld1.32 {$rndkey1},[$key],#16
	aes$mc $inout,$inout
	b.gt .Loop_${dir}c

	aes$e $inout,$rndkey0
	vld1.32 {$rndkey0},[$key]
	aes$mc $inout,$inout
	aes$e $inout,$rndkey1
	veor $inout,$inout,$rndkey0

	vst1.8 {$inout},[$out]
	ret
	.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
	___
	}
	&gen_block("en");
	&gen_block("de");
	}}}
	{{{
	my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
	my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
	my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

	my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

	### q8-q15 preloaded key schedule

	$code.=<<___;
	.globl ${prefix}_cbc_encrypt
	.type ${prefix}_cbc_encrypt,%function
	.align 5
	${prefix}_cbc_encrypt:
	___
	$code.=<<___ if ($flavour =~ /64/);
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
	___
	$code.=<<___ if ($flavour !~ /64/);
	mov ip,sp
	stmdb sp!,{r4-r8,lr}
	vstmdb sp!,{d8-d15} @ ABI specification says so
	ldmia ip,{r4-r5} @ load remaining args
	___
	$code.=<<___;
	subs $len,$len,#16
	mov $step,#16
	b.lo .Lcbc_abort
	cclr $step,eq

	cmp $enc,#0 // en- or decrypting?
	ldr $rounds,[$key,#240]
	and $len,$len,#-16
	vld1.8 {$ivec},[$ivp]
	vld1.8 {$dat},[$inp],$step

	vld1.32 {q8-q9},[$key] // load key schedule...
	sub $rounds,$rounds,#6
	add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
	sub $rounds,$rounds,#2
	vld1.32 {q10-q11},[$key_],#32
	vld1.32 {q12-q13},[$key_],#32
	vld1.32 {q14-q15},[$key_],#32
	vld1.32 {$rndlast},[$key_]

	add $key_,$key,#32
	mov $cnt,$rounds
	b.eq .Lcbc_dec

	cmp $rounds,#2
	veor $dat,$dat,$ivec
	veor $rndzero_n_last,q8,$rndlast
	b.eq .Lcbc_enc128

	.Loop_cbc_enc:
	aese $dat,q8
	vld1.32 {q8},[$key_],#16
	aesmc $dat,$dat
	subs $cnt,$cnt,#2
	aese $dat,q9
	vld1.32 {q9},[$key_],#16
	aesmc $dat,$dat
	b.gt .Loop_cbc_enc

	aese $dat,q8
	aesmc $dat,$dat
	subs $len,$len,#16
	aese $dat,q9
	aesmc $dat,$dat
	cclr $step,eq
	aese $dat,q10
	aesmc $dat,$dat
	add $key_,$key,#16
	aese $dat,q11
	aesmc $dat,$dat
	vld1.8 {q8},[$inp],$step
	aese $dat,q12
	aesmc $dat,$dat
	veor q8,q8,$rndzero_n_last
	aese $dat,q13
	aesmc $dat,$dat
	vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
	aese $dat,q14
	aesmc $dat,$dat
	aese $dat,q15

	mov $cnt,$rounds
	veor $ivec,$dat,$rndlast
	vst1.8 {$ivec},[$out],#16
	b.hs .Loop_cbc_enc

	b .Lcbc_done

	.align 5
	.Lcbc_enc128:
	vld1.32 {$in0-$in1},[$key_]
	aese $dat,q8
	aesmc $dat,$dat
	b .Lenter_cbc_enc128
	.Loop_cbc_enc128:
	aese $dat,q8
	aesmc $dat,$dat
	vst1.8 {$ivec},[$out],#16
	.Lenter_cbc_enc128:
	aese $dat,q9
	aesmc $dat,$dat
	subs $len,$len,#16
	aese $dat,$in0
	aesmc $dat,$dat
	cclr $step,eq
	aese $dat,$in1
	aesmc $dat,$dat
	aese $dat,q10
	aesmc $dat,$dat
	aese $dat,q11
	aesmc $dat,$dat
	vld1.8 {q8},[$inp],$step
	aese $dat,q12
	aesmc $dat,$dat
	aese $dat,q13
	aesmc $dat,$dat
	aese $dat,q14
	aesmc $dat,$dat
	veor q8,q8,$rndzero_n_last
	aese $dat,q15
	veor $ivec,$dat,$rndlast
	b.hs .Loop_cbc_enc128

	vst1.8 {$ivec},[$out],#16
	b .Lcbc_done
	___
	{
	my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
	$code.=<<___;
	.align 5
	.Lcbc_dec:
	vld1.8 {$dat2},[$inp],#16
	subs $len,$len,#32 // bias
	add $cnt,$rounds,#2
	vorr $in1,$dat,$dat
	vorr $dat1,$dat,$dat
	vorr $in2,$dat2,$dat2
	b.lo .Lcbc_dec_tail

	vorr $dat1,$dat2,$dat2
	vld1.8 {$dat2},[$inp],#16
	vorr $in0,$dat,$dat
	vorr $in1,$dat1,$dat1
	vorr $in2,$dat2,$dat2

	.Loop3x_cbc_dec:
	aesd $dat0,q8
	aesd $dat1,q8
	aesd $dat2,q8
	vld1.32 {q8},[$key_],#16
	aesimc $dat0,$dat0
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	subs $cnt,$cnt,#2
	aesd $dat0,q9
	aesd $dat1,q9
	aesd $dat2,q9
	vld1.32 {q9},[$key_],#16
	aesimc $dat0,$dat0
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	b.gt .Loop3x_cbc_dec

	aesd $dat0,q8
	aesd $dat1,q8
	aesd $dat2,q8
	veor $tmp0,$ivec,$rndlast
	aesimc $dat0,$dat0
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	veor $tmp1,$in0,$rndlast
	aesd $dat0,q9
	aesd $dat1,q9
	aesd $dat2,q9
	veor $tmp2,$in1,$rndlast
	subs $len,$len,#0x30
	aesimc $dat0,$dat0
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	vorr $ivec,$in2,$in2
	mov.lo x6,$len // x6, $cnt, is zero at this point
	aesd $dat0,q12
	aesd $dat1,q12
	aesd $dat2,q12
	add $inp,$inp,x6 // $inp is adjusted in such way that
	// at exit from the loop $dat1-$dat2
	// are loaded with last "words"
	aesimc $dat0,$dat0
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	mov $key_,$key
	aesd $dat0,q13
	aesd $dat1,q13
	aesd $dat2,q13
	vld1.8 {$in0},[$inp],#16
	aesimc $dat0,$dat0
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	vld1.8 {$in1},[$inp],#16
	aesd $dat0,q14
	aesd $dat1,q14
	aesd $dat2,q14
	vld1.8 {$in2},[$inp],#16
	aesimc $dat0,$dat0
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
	aesd $dat0,q15
	aesd $dat1,q15
	aesd $dat2,q15

	add $cnt,$rounds,#2
	veor $tmp0,$tmp0,$dat0
	veor $tmp1,$tmp1,$dat1
	veor $dat2,$dat2,$tmp2
	vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
	vorr $dat0,$in0,$in0
	vst1.8 {$tmp0},[$out],#16
	vorr $dat1,$in1,$in1
	vst1.8 {$tmp1},[$out],#16
	vst1.8 {$dat2},[$out],#16
	vorr $dat2,$in2,$in2
	b.hs .Loop3x_cbc_dec

	cmn $len,#0x30
	b.eq .Lcbc_done
	nop

	.Lcbc_dec_tail:
	aesd $dat1,q8
	aesd $dat2,q8
	vld1.32 {q8},[$key_],#16
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	subs $cnt,$cnt,#2
	aesd $dat1,q9
	aesd $dat2,q9
	vld1.32 {q9},[$key_],#16
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	b.gt .Lcbc_dec_tail

	aesd $dat1,q8
	aesd $dat2,q8
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	aesd $dat1,q9
	aesd $dat2,q9
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	aesd $dat1,q12
	aesd $dat2,q12
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	cmn $len,#0x20
	aesd $dat1,q13
	aesd $dat2,q13
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	veor $tmp1,$ivec,$rndlast
	aesd $dat1,q14
	aesd $dat2,q14
	aesimc $dat1,$dat1
	aesimc $dat2,$dat2
	veor $tmp2,$in1,$rndlast
	aesd $dat1,q15
	aesd $dat2,q15
	b.eq .Lcbc_dec_one
	veor $tmp1,$tmp1,$dat1
	veor $tmp2,$tmp2,$dat2
	vorr $ivec,$in2,$in2
	vst1.8 {$tmp1},[$out],#16
	vst1.8 {$tmp2},[$out],#16
	b .Lcbc_done

	.Lcbc_dec_one:
	veor $tmp1,$tmp1,$dat2
	vorr $ivec,$in2,$in2
	vst1.8 {$tmp1},[$out],#16

	.Lcbc_done:
	vst1.8 {$ivec},[$ivp]
	.Lcbc_abort:
	___
	}
	$code.=<<___ if ($flavour !~ /64/);
	vldmia sp!,{d8-d15}
	ldmia sp!,{r4-r8,pc}
	___
	$code.=<<___ if ($flavour =~ /64/);
	ldr x29,[sp],#16
	ret
	___
	$code.=<<___;
	.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
	___
	}}}
	{{{
	my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
	my ($rounds,$cnt,$key_)=("w5","w6","x7");
	my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
	my $step="x12"; # aliases with $tctr2

	my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
	my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

	my ($dat,$tmp)=($dat0,$tmp0);

	### q8-q15 preloaded key schedule

	$code.=<<___;
	.globl ${prefix}_ctr32_encrypt_blocks
	.type ${prefix}_ctr32_encrypt_blocks,%function
	.align 5
	${prefix}_ctr32_encrypt_blocks:
	___
	$code.=<<___ if ($flavour =~ /64/);
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
	___
	$code.=<<___ if ($flavour !~ /64/);
	mov ip,sp
	stmdb sp!,{r4-r10,lr}
	vstmdb sp!,{d8-d15} @ ABI specification says so
	ldr r4, [ip] @ load remaining arg
	___
	$code.=<<___;
	ldr $rounds,[$key,#240]

	ldr $ctr, [$ivp, #12]
	vld1.32 {$dat0},[$ivp]

	vld1.32 {q8-q9},[$key] // load key schedule...
	sub $rounds,$rounds,#4
	mov $step,#16
	cmp $len,#2
	add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
	sub $rounds,$rounds,#2
	vld1.32 {q12-q13},[$key_],#32
	vld1.32 {q14-q15},[$key_],#32
	vld1.32 {$rndlast},[$key_]
	add $key_,$key,#32
	mov $cnt,$rounds
	cclr $step,lo
	#ifndef __ARMEB__
	rev $ctr, $ctr
	#endif
	vorr $dat1,$dat0,$dat0
	add $tctr1, $ctr, #1
	vorr $dat2,$dat0,$dat0
	add $ctr, $ctr, #2
	vorr $ivec,$dat0,$dat0
	rev $tctr1, $tctr1
	vmov.32 ${dat1}[3],$tctr1
	b.ls .Lctr32_tail
	rev $tctr2, $ctr
	sub $len,$len,#3 // bias
	vmov.32 ${dat2}[3],$tctr2
	b .Loop3x_ctr32

	.align 4
	.Loop3x_ctr32:
	aese $dat0,q8
	aese $dat1,q8
	aese $dat2,q8
	vld1.32 {q8},[$key_],#16
	aesmc $dat0,$dat0
	aesmc $dat1,$dat1
	aesmc $dat2,$dat2
	subs $cnt,$cnt,#2
	aese $dat0,q9
	aese $dat1,q9
	aese $dat2,q9
	vld1.32 {q9},[$key_],#16
	aesmc $dat0,$dat0
	aesmc $dat1,$dat1
	aesmc $dat2,$dat2
	b.gt .Loop3x_ctr32

	aese $dat0,q8
	aese $dat1,q8
	aese $dat2,q8
	mov $key_,$key
	aesmc $tmp0,$dat0
	vld1.8 {$in0},[$inp],#16
	aesmc $tmp1,$dat1
	aesmc $dat2,$dat2
	vorr $dat0,$ivec,$ivec
	aese $tmp0,q9
	vld1.8 {$in1},[$inp],#16
	aese $tmp1,q9
	aese $dat2,q9
	vorr $dat1,$ivec,$ivec
	aesmc $tmp0,$tmp0
	vld1.8 {$in2},[$inp],#16
	aesmc $tmp1,$tmp1
	aesmc $tmp2,$dat2
	vorr $dat2,$ivec,$ivec
	add $tctr0,$ctr,#1
	aese $tmp0,q12
	aese $tmp1,q12
	aese $tmp2,q12
	veor $in0,$in0,$rndlast
	add $tctr1,$ctr,#2
	aesmc $tmp0,$tmp0
	aesmc $tmp1,$tmp1
	aesmc $tmp2,$tmp2
	veor $in1,$in1,$rndlast
	add $ctr,$ctr,#3
	aese $tmp0,q13
	aese $tmp1,q13
	aese $tmp2,q13
	veor $in2,$in2,$rndlast
	rev $tctr0,$tctr0
	aesmc $tmp0,$tmp0
	vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
	aesmc $tmp1,$tmp1
	aesmc $tmp2,$tmp2
	vmov.32 ${dat0}[3], $tctr0
	rev $tctr1,$tctr1
	aese $tmp0,q14
	aese $tmp1,q14
	aese $tmp2,q14
	vmov.32 ${dat1}[3], $tctr1
	rev $tctr2,$ctr
	aesmc $tmp0,$tmp0
	aesmc $tmp1,$tmp1
	aesmc $tmp2,$tmp2
	vmov.32 ${dat2}[3], $tctr2
	subs $len,$len,#3
	aese $tmp0,q15
	aese $tmp1,q15
	aese $tmp2,q15

	mov $cnt,$rounds
	veor $in0,$in0,$tmp0
	veor $in1,$in1,$tmp1
	veor $in2,$in2,$tmp2
	vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
	vst1.8 {$in0},[$out],#16
	vst1.8 {$in1},[$out],#16
	vst1.8 {$in2},[$out],#16
	b.hs .Loop3x_ctr32

	adds $len,$len,#3
	b.eq .Lctr32_done
	cmp $len,#1
	mov $step,#16
	cclr $step,eq

	.Lctr32_tail:
	aese $dat0,q8
	aese $dat1,q8
	vld1.32 {q8},[$key_],#16
	aesmc $dat0,$dat0
	aesmc $dat1,$dat1
	subs $cnt,$cnt,#2
	aese $dat0,q9
	aese $dat1,q9
	vld1.32 {q9},[$key_],#16
	aesmc $dat0,$dat0
	aesmc $dat1,$dat1
	b.gt .Lctr32_tail

	aese $dat0,q8
	aese $dat1,q8
	aesmc $dat0,$dat0
	aesmc $dat1,$dat1
	aese $dat0,q9
	aese $dat1,q9
	aesmc $dat0,$dat0
	aesmc $dat1,$dat1
	vld1.8 {$in0},[$inp],$step
	aese $dat0,q12
	aese $dat1,q12
	vld1.8 {$in1},[$inp]
	aesmc $dat0,$dat0
	aesmc $dat1,$dat1
	aese $dat0,q13
	aese $dat1,q13
	aesmc $dat0,$dat0
	aesmc $dat1,$dat1
	aese $dat0,q14
	aese $dat1,q14
	veor $in0,$in0,$rndlast
	aesmc $dat0,$dat0
	aesmc $dat1,$dat1
	veor $in1,$in1,$rndlast
	aese $dat0,q15
	aese $dat1,q15

	cmp $len,#1
	veor $in0,$in0,$dat0
	veor $in1,$in1,$dat1
	vst1.8 {$in0},[$out],#16
	b.eq .Lctr32_done
	vst1.8 {$in1},[$out]

	.Lctr32_done:
	___
	$code.=<<___ if ($flavour !~ /64/);
	vldmia sp!,{d8-d15}
	ldmia sp!,{r4-r10,pc}
	___
	$code.=<<___ if ($flavour =~ /64/);
	ldr x29,[sp],#16
	ret
	___
	$code.=<<___;
	.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
	___
	}}}
	$code.=<<___;
	#endif
	___
	########################################
	if ($flavour =~ /64/) { ######## 64-bit code
	my %opcode = (
	"aesd" => 0x4e285800, "aese" => 0x4e284800,
	"aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );

	local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,],\s[qv]([0-9]+)/o &&
	sprintf ".inst\t0x%08x\t//%s %s",
	$opcode{$mnemonic}\|$1\|($2<<5),
	$mnemonic,$arg;
	};

	foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
	s/@\s/\/\//o; # old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
	s/vmov\.i8/movi/o or # fix up legacy mnemonics
	s/vext\.8/ext/o or
	s/vrev32\.8/rev32/o or
	s/vtst\.8/cmtst/o or
	s/vshr/ushr/o or
	s/^(\s+)v/$1/o or # strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remainig legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
	}
	} else { ######## 32-bit code
	my %opcode = (
	"aesd" => 0xf3b00340, "aese" => 0xf3b00300,
	"aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );

	local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,],\s[qv]([0-9]+)/o) {
	my $word = $opcode{$mnemonic}\|(($1&7)<<13)\|(($1&8)<<19)
	\|(($2&7)<<1) \|(($2&8)<<2);
	# since ARMv7 instructions are always encoded little-endian.
	# correct solution is to use .inst directive, but older
	# assemblers don't implement it:-(
	sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
	$word&0xff,($word>>8)&0xff,
	($word>>16)&0xff,($word>>24)&0xff,
	$mnemonic,$arg;
	}
	};

	sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s\{q([0-9]+)\},\sq([0-9]+)/o &&
	sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
	"vtbl.8 d%d,{q%d},d%d", 2$1,$2,2$3, 2$1+1,$2,2$3+1;
	}

	sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
	}

	sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
	}

	foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
	s/\/\/\s?/@ /o; # new->old style commentary

	# fix up remainig new-style suffixes
	s/\{q([0-9]+)\},\s\[(.+)\],#8/sprintf "{d%d},[$2]!",2$1/eo or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo or
	s/^(\s+)b\./$1b/o or
	s/^(\s+)mov\./$1mov/o or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
	}
	}

	close STDOUT;