| #! /usr/bin/env perl |
| # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. |
| # |
| # Licensed under the OpenSSL license (the "License"). You may not use |
| # this file except in compliance with the License. You can obtain a copy |
| # in the file LICENSE in the source distribution or at |
| # https://www.openssl.org/source/license.html |
| |
| # |
| # ==================================================================== |
| # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| # ==================================================================== |
| # |
| # December 2014 |
| # |
| # ChaCha20 for ARMv4. |
| # |
# Performance in cycles per byte out of a large buffer.
| # |
| # IALU/gcc-4.4 1xNEON 3xNEON+1xIALU |
| # |
| # Cortex-A5 19.3(*)/+95% 21.8 14.1 |
| # Cortex-A8 10.5(*)/+160% 13.9 6.35 |
| # Cortex-A9 12.9(**)/+110% 14.3 6.50 |
| # Cortex-A15 11.0/+40% 16.0 5.00 |
| # Snapdragon S4 11.5/+125% 13.6 4.90 |
| # |
# (*)	most "favourable" result, for aligned data on a little-endian
#	processor; results for misaligned data are 10-15% lower;
# (**)	this result is a trade-off: it could be improved by 20%,
#	but then the Snapdragon S4 and Cortex-A8 results would get
#	20-25% worse;
| |
| $flavour = shift; |
| if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } |
| else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } |
| |
| if ($flavour && $flavour ne "void") { |
| $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or |
| ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or |
| die "can't locate arm-xlate.pl"; |
| |
| open OUT,"| \"$^X\" $xlate $flavour $output"; |
| *STDOUT=*OUT; |
| } else { |
| open OUT,">$output"; |
| *STDOUT=*OUT; |
| } |
| |
| sub AUTOLOAD() # thunk [simplified] x86-style perlasm |
| { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; |
| my $arg = pop; |
| $arg = "#$arg" if ($arg*1 eq $arg); |
| $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; |
| } |
| |
my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));	# state words; "rx" entries live on the stack
my @t=map("r$_",(8..11));				# scratch registers
| |
| sub ROUND { |
| my ($a0,$b0,$c0,$d0)=@_; |
| my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); |
| my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); |
| my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); |
| my $odd = $d0&1; |
| my ($xc,$xc_) = (@t[0..1]); |
| my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]); |
| my @ret; |
| |
	# Consider the order in which the variables are addressed by
	# their index:
| # |
| # a b c d |
| # |
| # 0 4 8 12 < even round |
| # 1 5 9 13 |
| # 2 6 10 14 |
| # 3 7 11 15 |
| # 0 5 10 15 < odd round |
| # 1 6 11 12 |
| # 2 7 8 13 |
| # 3 4 9 14 |
| # |
	# 'a' and 'b' are permanently allocated in registers, @x[0..7],
	# while the 'c's and a pair of 'd's are maintained in memory. If
	# you observe the 'c' column, you'll notice that the pair of 'c's
	# is invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see a
	# bunch of 'c' stores and loads in the middle, but none at the
	# beginning or end. If you observe the 'd' column, you'll notice
	# that 15 and 13 are reused in the next pair of rounds. This is
	# why these two are chosen for off-loading to memory: it makes
	# the loads count for more.
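	#
	# For reference, one ChaCha quarter-round on (a,b,c,d) is:
	#
	#	a += b; d ^= a; d = rol(d,16);
	#	c += d; b ^= c; b = rol(b,12);
	#	a += b; d ^= a; d = rol(d, 8);
	#	c += d; b ^= c; b = rol(b, 7);
	#
	# Each "x ^= y; x = rol(x,n)" pair below is computed instead as
	# ror(x,32-n) ^ ror(y,32-n), which yields the same value: a mov
	# with rotate followed by an eor with a rotated second operand.
	# The mov half depends only on the old 'x', so it can be
	# interleaved with the add of an independent column to keep
	# dual-issue pipelines busy.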
| push @ret,( |
| "&add (@x[$a0],@x[$a0],@x[$b0])", |
| "&mov ($xd,$xd,'ror#16')", |
| "&add (@x[$a1],@x[$a1],@x[$b1])", |
| "&mov ($xd_,$xd_,'ror#16')", |
| "&eor ($xd,$xd,@x[$a0],'ror#16')", |
| "&eor ($xd_,$xd_,@x[$a1],'ror#16')", |
| |
| "&add ($xc,$xc,$xd)", |
| "&mov (@x[$b0],@x[$b0],'ror#20')", |
| "&add ($xc_,$xc_,$xd_)", |
| "&mov (@x[$b1],@x[$b1],'ror#20')", |
| "&eor (@x[$b0],@x[$b0],$xc,'ror#20')", |
| "&eor (@x[$b1],@x[$b1],$xc_,'ror#20')", |
| |
| "&add (@x[$a0],@x[$a0],@x[$b0])", |
| "&mov ($xd,$xd,'ror#24')", |
| "&add (@x[$a1],@x[$a1],@x[$b1])", |
| "&mov ($xd_,$xd_,'ror#24')", |
| "&eor ($xd,$xd,@x[$a0],'ror#24')", |
| "&eor ($xd_,$xd_,@x[$a1],'ror#24')", |
| |
| "&add ($xc,$xc,$xd)", |
| "&mov (@x[$b0],@x[$b0],'ror#25')" ); |
| push @ret,( |
| "&str ($xd,'[sp,#4*(16+$d0)]')", |
| "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd); |
| push @ret,( |
| "&add ($xc_,$xc_,$xd_)", |
| "&mov (@x[$b1],@x[$b1],'ror#25')" ); |
| push @ret,( |
| "&str ($xd_,'[sp,#4*(16+$d1)]')", |
| "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd); |
| push @ret,( |
| "&eor (@x[$b0],@x[$b0],$xc,'ror#25')", |
| "&eor (@x[$b1],@x[$b1],$xc_,'ror#25')" ); |
| |
| $xd=@x[$d2] if (!$odd); |
| $xd_=@x[$d3] if ($odd); |
| push @ret,( |
| "&str ($xc,'[sp,#4*(16+$c0)]')", |
| "&ldr ($xc,'[sp,#4*(16+$c2)]')", |
| "&add (@x[$a2],@x[$a2],@x[$b2])", |
| "&mov ($xd,$xd,'ror#16')", |
| "&str ($xc_,'[sp,#4*(16+$c1)]')", |
| "&ldr ($xc_,'[sp,#4*(16+$c3)]')", |
| "&add (@x[$a3],@x[$a3],@x[$b3])", |
| "&mov ($xd_,$xd_,'ror#16')", |
| "&eor ($xd,$xd,@x[$a2],'ror#16')", |
| "&eor ($xd_,$xd_,@x[$a3],'ror#16')", |
| |
| "&add ($xc,$xc,$xd)", |
| "&mov (@x[$b2],@x[$b2],'ror#20')", |
| "&add ($xc_,$xc_,$xd_)", |
| "&mov (@x[$b3],@x[$b3],'ror#20')", |
| "&eor (@x[$b2],@x[$b2],$xc,'ror#20')", |
| "&eor (@x[$b3],@x[$b3],$xc_,'ror#20')", |
| |
| "&add (@x[$a2],@x[$a2],@x[$b2])", |
| "&mov ($xd,$xd,'ror#24')", |
| "&add (@x[$a3],@x[$a3],@x[$b3])", |
| "&mov ($xd_,$xd_,'ror#24')", |
| "&eor ($xd,$xd,@x[$a2],'ror#24')", |
| "&eor ($xd_,$xd_,@x[$a3],'ror#24')", |
| |
| "&add ($xc,$xc,$xd)", |
| "&mov (@x[$b2],@x[$b2],'ror#25')", |
| "&add ($xc_,$xc_,$xd_)", |
| "&mov (@x[$b3],@x[$b3],'ror#25')", |
| "&eor (@x[$b2],@x[$b2],$xc,'ror#25')", |
| "&eor (@x[$b3],@x[$b3],$xc_,'ror#25')" ); |
| |
| @ret; |
| } |
| |
| $code.=<<___; |
| #include <openssl/arm_arch.h> |
| |
| @ Silence ARMv8 deprecated IT instruction warnings. This file is used by both |
| @ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. |
| .arch armv7-a |
| |
| .text |
| #if defined(__thumb2__) || defined(__clang__) |
| .syntax unified |
| #endif |
| #if defined(__thumb2__) |
| .thumb |
| #else |
| .code 32 |
| #endif |
| |
| #if defined(__thumb2__) || defined(__clang__) |
| #define ldrhsb ldrbhs |
| #endif |
| |
| .align 5 |
| .Lsigma: |
| .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral |
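@ i.e. the ASCII string "expand 32-byte k", word by word, little-endian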
| .Lone: |
| .long 1,0,0,0 |
| #if __ARM_MAX_ARCH__>=7 |
| .LOPENSSL_armcap: |
| .word OPENSSL_armcap_P-.LChaCha20_ctr32 |
| #else |
| .word -1 |
| #endif |
| |
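@ The expected C prototype (as declared for this routine elsewhere
@ in OpenSSL) is:
@
@ void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
@                     size_t len, const unsigned int key[8],
@                     const unsigned int counter[4]);
@
@ out, inp, len and key arrive in r0-r3; the counter pointer is
@ pulled off the stack below.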
| .globl ChaCha20_ctr32 |
| .type ChaCha20_ctr32,%function |
| .align 5 |
| ChaCha20_ctr32: |
| .LChaCha20_ctr32: |
| ldr r12,[sp,#0] @ pull pointer to counter and nonce |
| stmdb sp!,{r0-r2,r4-r11,lr} |
| #if __ARM_ARCH__<7 && !defined(__thumb2__) |
| sub r14,pc,#16 @ ChaCha20_ctr32 |
| #else |
| adr r14,.LChaCha20_ctr32 |
| #endif |
| cmp r2,#0 @ len==0? |
| #ifdef __thumb2__ |
| itt eq |
| #endif |
| addeq sp,sp,#4*3 |
| beq .Lno_data |
| #if __ARM_MAX_ARCH__>=7 |
| cmp r2,#192 @ test len |
| bls .Lshort |
| ldr r4,[r14,#-32] |
| ldr r4,[r14,r4] |
| # ifdef __APPLE__ |
| ldr r4,[r4] |
| # endif |
| tst r4,#ARMV7_NEON |
| bne .LChaCha20_neon |
| .Lshort: |
| #endif |
| ldmia r12,{r4-r7} @ load counter and nonce |
| sub sp,sp,#4*(16) @ off-load area |
| sub r14,r14,#64 @ .Lsigma |
| stmdb sp!,{r4-r7} @ copy counter and nonce |
| ldmia r3,{r4-r11} @ load key |
| ldmia r14,{r0-r3} @ load sigma |
| stmdb sp!,{r4-r11} @ copy key |
| stmdb sp!,{r0-r3} @ copy sigma |
| str r10,[sp,#4*(16+10)] @ off-load "@x[10]" |
| str r11,[sp,#4*(16+11)] @ off-load "@x[11]" |
| b .Loop_outer_enter |
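
@ The scratch frame is now laid out as follows: sp+4*0..15 holds
@ the copy of the input block (sigma|key|counter|nonce), sp+4*16..31
@ is the off-load area for half of the state, and sp+4*32..34 are
@ out, inp and len as pushed on entry.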
| |
| .align 4 |
| .Loop_outer: |
| ldmia sp,{r0-r9} @ load key material |
| str @t[3],[sp,#4*(32+2)] @ save len |
| str r12, [sp,#4*(32+1)] @ save inp |
| str r14, [sp,#4*(32+0)] @ save out |
| .Loop_outer_enter: |
| ldr @t[3], [sp,#4*(15)] |
| ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load |
| ldr @t[2], [sp,#4*(13)] |
| ldr @x[14],[sp,#4*(14)] |
| str @t[3], [sp,#4*(16+15)] |
| mov @t[3],#10 |
| b .Loop |
| |
| .align 4 |
| .Loop: |
| subs @t[3],@t[3],#1 |
| ___ |
| foreach (&ROUND(0, 4, 8,12)) { eval; } |
| foreach (&ROUND(0, 5,10,15)) { eval; } |
| $code.=<<___; |
| bne .Loop |
| |
| ldr @t[3],[sp,#4*(32+2)] @ load len |
| |
| str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store |
| str @t[1], [sp,#4*(16+9)] |
| str @x[12],[sp,#4*(16+12)] |
| str @t[2], [sp,#4*(16+13)] |
| str @x[14],[sp,#4*(16+14)] |
| |
| @ at this point we have first half of 512-bit result in |
| @ @x[0-7] and second half at sp+4*(16+8) |
| |
| cmp @t[3],#64 @ done yet? |
| #ifdef __thumb2__ |
| itete lo |
| #endif |
| addlo r12,sp,#4*(0) @ shortcut or ... |
| ldrhs r12,[sp,#4*(32+1)] @ ... load inp |
| addlo r14,sp,#4*(0) @ shortcut or ... |
| ldrhs r14,[sp,#4*(32+0)] @ ... load out |
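@ ("hs" above means len>=64 and selects the streaming path; "lo"
@ has just pointed both inp and out at the stack, in which case
@ the code below deposits one block of raw keystream in the frame
@ for .Ltail to xor with the actual input byte by byte.)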
| |
| ldr @t[0],[sp,#4*(0)] @ load key material |
| ldr @t[1],[sp,#4*(1)] |
| |
| #if __ARM_ARCH__>=6 || !defined(__ARMEB__) |
| # if __ARM_ARCH__<7 |
| orr @t[2],r12,r14 |
| tst @t[2],#3 @ are input and output aligned? |
| ldr @t[2],[sp,#4*(2)] |
| bne .Lunaligned |
| cmp @t[3],#64 @ restore flags |
| # else |
| ldr @t[2],[sp,#4*(2)] |
| # endif |
| ldr @t[3],[sp,#4*(3)] |
| |
| add @x[0],@x[0],@t[0] @ accumulate key material |
| add @x[1],@x[1],@t[1] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[0],[r12],#16 @ load input |
| ldrhs @t[1],[r12,#-12] |
| |
| add @x[2],@x[2],@t[2] |
| add @x[3],@x[3],@t[3] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[2],[r12,#-8] |
| ldrhs @t[3],[r12,#-4] |
| # if __ARM_ARCH__>=6 && defined(__ARMEB__) |
| rev @x[0],@x[0] |
| rev @x[1],@x[1] |
| rev @x[2],@x[2] |
| rev @x[3],@x[3] |
| # endif |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[0],@x[0],@t[0] @ xor with input |
| eorhs @x[1],@x[1],@t[1] |
| add @t[0],sp,#4*(4) |
| str @x[0],[r14],#16 @ store output |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[2],@x[2],@t[2] |
| eorhs @x[3],@x[3],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| str @x[1],[r14,#-12] |
| str @x[2],[r14,#-8] |
| str @x[3],[r14,#-4] |
| |
| add @x[4],@x[4],@t[0] @ accumulate key material |
| add @x[5],@x[5],@t[1] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[0],[r12],#16 @ load input |
| ldrhs @t[1],[r12,#-12] |
| add @x[6],@x[6],@t[2] |
| add @x[7],@x[7],@t[3] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[2],[r12,#-8] |
| ldrhs @t[3],[r12,#-4] |
| # if __ARM_ARCH__>=6 && defined(__ARMEB__) |
| rev @x[4],@x[4] |
| rev @x[5],@x[5] |
| rev @x[6],@x[6] |
| rev @x[7],@x[7] |
| # endif |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[4],@x[4],@t[0] |
| eorhs @x[5],@x[5],@t[1] |
| add @t[0],sp,#4*(8) |
| str @x[4],[r14],#16 @ store output |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[6],@x[6],@t[2] |
| eorhs @x[7],@x[7],@t[3] |
| str @x[5],[r14,#-12] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| str @x[6],[r14,#-8] |
| add @x[0],sp,#4*(16+8) |
| str @x[7],[r14,#-4] |
| |
| ldmia @x[0],{@x[0]-@x[7]} @ load second half |
| |
| add @x[0],@x[0],@t[0] @ accumulate key material |
| add @x[1],@x[1],@t[1] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[0],[r12],#16 @ load input |
| ldrhs @t[1],[r12,#-12] |
| # ifdef __thumb2__ |
| itt hi |
| # endif |
| strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it |
| strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it |
| add @x[2],@x[2],@t[2] |
| add @x[3],@x[3],@t[3] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[2],[r12,#-8] |
| ldrhs @t[3],[r12,#-4] |
| # if __ARM_ARCH__>=6 && defined(__ARMEB__) |
| rev @x[0],@x[0] |
| rev @x[1],@x[1] |
| rev @x[2],@x[2] |
| rev @x[3],@x[3] |
| # endif |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[0],@x[0],@t[0] |
| eorhs @x[1],@x[1],@t[1] |
| add @t[0],sp,#4*(12) |
| str @x[0],[r14],#16 @ store output |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[2],@x[2],@t[2] |
| eorhs @x[3],@x[3],@t[3] |
| str @x[1],[r14,#-12] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| str @x[2],[r14,#-8] |
| str @x[3],[r14,#-4] |
| |
| add @x[4],@x[4],@t[0] @ accumulate key material |
| add @x[5],@x[5],@t[1] |
| # ifdef __thumb2__ |
| itt hi |
| # endif |
| addhi @t[0],@t[0],#1 @ next counter value |
| strhi @t[0],[sp,#4*(12)] @ save next counter value |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[0],[r12],#16 @ load input |
| ldrhs @t[1],[r12,#-12] |
| add @x[6],@x[6],@t[2] |
| add @x[7],@x[7],@t[3] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[2],[r12,#-8] |
| ldrhs @t[3],[r12,#-4] |
| # if __ARM_ARCH__>=6 && defined(__ARMEB__) |
| rev @x[4],@x[4] |
| rev @x[5],@x[5] |
| rev @x[6],@x[6] |
| rev @x[7],@x[7] |
| # endif |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[4],@x[4],@t[0] |
| eorhs @x[5],@x[5],@t[1] |
| # ifdef __thumb2__ |
| it ne |
| # endif |
| ldrne @t[0],[sp,#4*(32+2)] @ re-load len |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[6],@x[6],@t[2] |
| eorhs @x[7],@x[7],@t[3] |
| str @x[4],[r14],#16 @ store output |
| str @x[5],[r14,#-12] |
| # ifdef __thumb2__ |
| it hs |
| # endif |
| subhs @t[3],@t[0],#64 @ len-=64 |
| str @x[6],[r14,#-8] |
| str @x[7],[r14,#-4] |
| bhi .Loop_outer |
| |
| beq .Ldone |
| # if __ARM_ARCH__<7 |
| b .Ltail |
| |
| .align 4 |
| .Lunaligned: @ unaligned endian-neutral path |
| cmp @t[3],#64 @ restore flags |
| # endif |
| #endif |
| #if __ARM_ARCH__<7 |
| ldr @t[3],[sp,#4*(3)] |
| ___ |
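# Byte-by-byte, endian-neutral merge: input is fetched one byte at
# a time with ldrb/ldrhsb and xored into each state word 8 bits at
# a time through the "lsr#8" chains below, and output leaves through
# strb, so neither pointer needs to be word-aligned. When len<64
# ("lo"), eorlo zeroes the would-be input bytes, so the same code
# deposits raw keystream on the stack for .Ltail instead.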
| for ($i=0;$i<16;$i+=4) { |
| my $j=$i&0x7; |
| |
| $code.=<<___ if ($i==4); |
| add @x[0],sp,#4*(16+8) |
| ___ |
| $code.=<<___ if ($i==8); |
| ldmia @x[0],{@x[0]-@x[7]} @ load second half |
| # ifdef __thumb2__ |
| itt hi |
| # endif |
| strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" |
| strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" |
| ___ |
| $code.=<<___; |
| add @x[$j+0],@x[$j+0],@t[0] @ accumulate key material |
| ___ |
| $code.=<<___ if ($i==12); |
| # ifdef __thumb2__ |
| itt hi |
| # endif |
| addhi @t[0],@t[0],#1 @ next counter value |
| strhi @t[0],[sp,#4*(12)] @ save next counter value |
| ___ |
| $code.=<<___; |
| add @x[$j+1],@x[$j+1],@t[1] |
| add @x[$j+2],@x[$j+2],@t[2] |
| # ifdef __thumb2__ |
| itete lo |
| # endif |
| eorlo @t[0],@t[0],@t[0] @ zero or ... |
| ldrhsb @t[0],[r12],#16 @ ... load input |
| eorlo @t[1],@t[1],@t[1] |
| ldrhsb @t[1],[r12,#-12] |
| |
| add @x[$j+3],@x[$j+3],@t[3] |
| # ifdef __thumb2__ |
| itete lo |
| # endif |
| eorlo @t[2],@t[2],@t[2] |
| ldrhsb @t[2],[r12,#-8] |
| eorlo @t[3],@t[3],@t[3] |
| ldrhsb @t[3],[r12,#-4] |
| |
| eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero) |
| eor @x[$j+1],@t[1],@x[$j+1] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhsb @t[0],[r12,#-15] @ load more input |
| ldrhsb @t[1],[r12,#-11] |
| eor @x[$j+2],@t[2],@x[$j+2] |
| strb @x[$j+0],[r14],#16 @ store output |
| eor @x[$j+3],@t[3],@x[$j+3] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhsb @t[2],[r12,#-7] |
| ldrhsb @t[3],[r12,#-3] |
| strb @x[$j+1],[r14,#-12] |
| eor @x[$j+0],@t[0],@x[$j+0],lsr#8 |
| strb @x[$j+2],[r14,#-8] |
| eor @x[$j+1],@t[1],@x[$j+1],lsr#8 |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhsb @t[0],[r12,#-14] @ load more input |
| ldrhsb @t[1],[r12,#-10] |
| strb @x[$j+3],[r14,#-4] |
| eor @x[$j+2],@t[2],@x[$j+2],lsr#8 |
| strb @x[$j+0],[r14,#-15] |
| eor @x[$j+3],@t[3],@x[$j+3],lsr#8 |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhsb @t[2],[r12,#-6] |
| ldrhsb @t[3],[r12,#-2] |
| strb @x[$j+1],[r14,#-11] |
| eor @x[$j+0],@t[0],@x[$j+0],lsr#8 |
| strb @x[$j+2],[r14,#-7] |
| eor @x[$j+1],@t[1],@x[$j+1],lsr#8 |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhsb @t[0],[r12,#-13] @ load more input |
| ldrhsb @t[1],[r12,#-9] |
| strb @x[$j+3],[r14,#-3] |
| eor @x[$j+2],@t[2],@x[$j+2],lsr#8 |
| strb @x[$j+0],[r14,#-14] |
| eor @x[$j+3],@t[3],@x[$j+3],lsr#8 |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhsb @t[2],[r12,#-5] |
| ldrhsb @t[3],[r12,#-1] |
| strb @x[$j+1],[r14,#-10] |
| strb @x[$j+2],[r14,#-6] |
| eor @x[$j+0],@t[0],@x[$j+0],lsr#8 |
| strb @x[$j+3],[r14,#-2] |
| eor @x[$j+1],@t[1],@x[$j+1],lsr#8 |
| strb @x[$j+0],[r14,#-13] |
| eor @x[$j+2],@t[2],@x[$j+2],lsr#8 |
| strb @x[$j+1],[r14,#-9] |
| eor @x[$j+3],@t[3],@x[$j+3],lsr#8 |
| strb @x[$j+2],[r14,#-5] |
| strb @x[$j+3],[r14,#-1] |
| ___ |
| $code.=<<___ if ($i<12); |
| add @t[0],sp,#4*(4+$i) |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| ___ |
| } |
| $code.=<<___; |
| # ifdef __thumb2__ |
| it ne |
| # endif |
| ldrne @t[0],[sp,#4*(32+2)] @ re-load len |
| # ifdef __thumb2__ |
| it hs |
| # endif |
| subhs @t[3],@t[0],#64 @ len-=64 |
| bhi .Loop_outer |
| |
| beq .Ldone |
| #endif |
| |
| .Ltail: |
| ldr r12,[sp,#4*(32+1)] @ load inp |
| add @t[1],sp,#4*(0) |
| ldr r14,[sp,#4*(32+0)] @ load out |
| |
| .Loop_tail: |
| ldrb @t[2],[@t[1]],#1 @ read buffer on stack |
| ldrb @t[3],[r12],#1 @ read input |
| subs @t[0],@t[0],#1 |
| eor @t[3],@t[3],@t[2] |
| strb @t[3],[r14],#1 @ store output |
| bne .Loop_tail |
| |
| .Ldone: |
| add sp,sp,#4*(32+3) |
| .Lno_data: |
| ldmia sp!,{r4-r11,pc} |
| .size ChaCha20_ctr32,.-ChaCha20_ctr32 |
| ___ |
| |
| {{{ |
| my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) = |
| map("q$_",(0..15)); |
| |
| sub NEONROUND { |
| my $odd = pop; |
| my ($a,$b,$c,$d,$t)=@_; |
| |
| ( |
| "&vadd_i32 ($a,$a,$b)", |
| "&veor ($d,$d,$a)", |
| "&vrev32_16 ($d,$d)", # vrot ($d,16) |
| |
| "&vadd_i32 ($c,$c,$d)", |
| "&veor ($t,$b,$c)", |
| "&vshr_u32 ($b,$t,20)", |
| "&vsli_32 ($b,$t,12)", |
| |
| "&vadd_i32 ($a,$a,$b)", |
| "&veor ($t,$d,$a)", |
| "&vshr_u32 ($d,$t,24)", |
| "&vsli_32 ($d,$t,8)", |
| |
| "&vadd_i32 ($c,$c,$d)", |
| "&veor ($t,$b,$c)", |
| "&vshr_u32 ($b,$t,25)", |
| "&vsli_32 ($b,$t,7)", |
| |
| "&vext_8 ($c,$c,$c,8)", |
| "&vext_8 ($b,$b,$b,$odd?12:4)", |
| "&vext_8 ($d,$d,$d,$odd?4:12)" |
| ); |
| } |
| |
| $code.=<<___; |
| #if __ARM_MAX_ARCH__>=7 |
| .arch armv7-a |
| .fpu neon |
| |
| .type ChaCha20_neon,%function |
| .align 5 |
| ChaCha20_neon: |
| ldr r12,[sp,#0] @ pull pointer to counter and nonce |
| stmdb sp!,{r0-r2,r4-r11,lr} |
| .LChaCha20_neon: |
| adr r14,.Lsigma |
| vstmdb sp!,{d8-d15} @ ABI spec says so |
| stmdb sp!,{r0-r3} |
| |
| vld1.32 {$b0-$c0},[r3] @ load key |
| ldmia r3,{r4-r11} @ load key |
| |
| sub sp,sp,#4*(16+16) |
| vld1.32 {$d0},[r12] @ load counter and nonce |
| add r12,sp,#4*8 |
| ldmia r14,{r0-r3} @ load sigma |
| vld1.32 {$a0},[r14]! @ load sigma |
| vld1.32 {$t0},[r14] @ one |
| vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce |
| vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key |
| |
| str r10,[sp,#4*(16+10)] @ off-load "@x[10]" |
| str r11,[sp,#4*(16+11)] @ off-load "@x[11]" |
| vshl.i32 $t1#lo,$t0#lo,#1 @ two |
| vstr $t0#lo,[sp,#4*(16+0)] |
| vshl.i32 $t2#lo,$t0#lo,#2 @ four |
| vstr $t1#lo,[sp,#4*(16+2)] |
| vmov $a1,$a0 |
| vstr $t2#lo,[sp,#4*(16+4)] |
| vmov $a2,$a0 |
| vmov $b1,$b0 |
| vmov $b2,$b0 |
| b .Loop_neon_enter |
| |
| .align 4 |
| .Loop_neon_outer: |
| ldmia sp,{r0-r9} @ load key material |
| cmp @t[3],#64*2 @ if len<=64*2 |
| bls .Lbreak_neon @ switch to integer-only |
| vmov $a1,$a0 |
| str @t[3],[sp,#4*(32+2)] @ save len |
| vmov $a2,$a0 |
| str r12, [sp,#4*(32+1)] @ save inp |
| vmov $b1,$b0 |
| str r14, [sp,#4*(32+0)] @ save out |
| vmov $b2,$b0 |
| .Loop_neon_enter: |
| ldr @t[3], [sp,#4*(15)] |
| vadd.i32 $d1,$d0,$t0 @ counter+1 |
| ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load |
| vmov $c1,$c0 |
| ldr @t[2], [sp,#4*(13)] |
| vmov $c2,$c0 |
| ldr @x[14],[sp,#4*(14)] |
| vadd.i32 $d2,$d1,$t0 @ counter+2 |
| str @t[3], [sp,#4*(16+15)] |
| mov @t[3],#10 |
| add @x[12],@x[12],#3 @ counter+3 |
| b .Loop_neon |
| |
| .align 4 |
| .Loop_neon: |
| subs @t[3],@t[3],#1 |
| ___ |
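# Interleave three NEON "threads", each carrying one 64-byte block
# in q-registers, with the scalar ROUND from above working on a
# fourth block, pairing the instruction streams roughly one IALU
# instruction per NEON instruction. This is the 3xNEON+1xIALU mode
# from the performance table at the top of the file.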
| my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0); |
| my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0); |
| my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0); |
| my @thread3=&ROUND(0,4,8,12); |
| |
| foreach (@thread0) { |
| eval; eval(shift(@thread3)); |
| eval(shift(@thread1)); eval(shift(@thread3)); |
| eval(shift(@thread2)); eval(shift(@thread3)); |
| } |
| |
| @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1); |
| @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1); |
| @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1); |
| @thread3=&ROUND(0,5,10,15); |
| |
| foreach (@thread0) { |
| eval; eval(shift(@thread3)); |
| eval(shift(@thread1)); eval(shift(@thread3)); |
| eval(shift(@thread2)); eval(shift(@thread3)); |
| } |
| $code.=<<___; |
| bne .Loop_neon |
| |
| add @t[3],sp,#32 |
| vld1.32 {$t0-$t1},[sp] @ load key material |
| vld1.32 {$t2-$t3},[@t[3]] |
| |
| ldr @t[3],[sp,#4*(32+2)] @ load len |
| |
| str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store |
| str @t[1], [sp,#4*(16+9)] |
| str @x[12],[sp,#4*(16+12)] |
| str @t[2], [sp,#4*(16+13)] |
| str @x[14],[sp,#4*(16+14)] |
| |
| @ at this point we have first half of 512-bit result in |
| @ @x[0-7] and second half at sp+4*(16+8) |
| |
| ldr r12,[sp,#4*(32+1)] @ load inp |
| ldr r14,[sp,#4*(32+0)] @ load out |
| |
| vadd.i32 $a0,$a0,$t0 @ accumulate key material |
| vadd.i32 $a1,$a1,$t0 |
| vadd.i32 $a2,$a2,$t0 |
| vldr $t0#lo,[sp,#4*(16+0)] @ one |
| |
| vadd.i32 $b0,$b0,$t1 |
| vadd.i32 $b1,$b1,$t1 |
| vadd.i32 $b2,$b2,$t1 |
| vldr $t1#lo,[sp,#4*(16+2)] @ two |
| |
| vadd.i32 $c0,$c0,$t2 |
| vadd.i32 $c1,$c1,$t2 |
| vadd.i32 $c2,$c2,$t2 |
| vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1 |
| vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2 |
| |
| vadd.i32 $d0,$d0,$t3 |
| vadd.i32 $d1,$d1,$t3 |
| vadd.i32 $d2,$d2,$t3 |
| |
| cmp @t[3],#64*4 |
| blo .Ltail_neon |
| |
| vld1.8 {$t0-$t1},[r12]! @ load input |
| mov @t[3],sp |
| vld1.8 {$t2-$t3},[r12]! |
| veor $a0,$a0,$t0 @ xor with input |
| veor $b0,$b0,$t1 |
| vld1.8 {$t0-$t1},[r12]! |
| veor $c0,$c0,$t2 |
| veor $d0,$d0,$t3 |
| vld1.8 {$t2-$t3},[r12]! |
| |
| veor $a1,$a1,$t0 |
| vst1.8 {$a0-$b0},[r14]! @ store output |
| veor $b1,$b1,$t1 |
| vld1.8 {$t0-$t1},[r12]! |
| veor $c1,$c1,$t2 |
| vst1.8 {$c0-$d0},[r14]! |
| veor $d1,$d1,$t3 |
| vld1.8 {$t2-$t3},[r12]! |
| |
| veor $a2,$a2,$t0 |
| vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration |
| veor $t0#hi,$t0#hi,$t0#hi |
| vldr $t0#lo,[sp,#4*(16+4)] @ four |
| veor $b2,$b2,$t1 |
| vld1.32 {$c0-$d0},[@t[3]] |
| veor $c2,$c2,$t2 |
| vst1.8 {$a1-$b1},[r14]! |
| veor $d2,$d2,$t3 |
| vst1.8 {$c1-$d1},[r14]! |
| |
| vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value |
| vldr $t0#lo,[sp,#4*(16+0)] @ one |
| |
| ldmia sp,{@t[0]-@t[3]} @ load key material |
| add @x[0],@x[0],@t[0] @ accumulate key material |
| ldr @t[0],[r12],#16 @ load input |
| vst1.8 {$a2-$b2},[r14]! |
| add @x[1],@x[1],@t[1] |
| ldr @t[1],[r12,#-12] |
| vst1.8 {$c2-$d2},[r14]! |
| add @x[2],@x[2],@t[2] |
| ldr @t[2],[r12,#-8] |
| add @x[3],@x[3],@t[3] |
| ldr @t[3],[r12,#-4] |
| # ifdef __ARMEB__ |
| rev @x[0],@x[0] |
| rev @x[1],@x[1] |
| rev @x[2],@x[2] |
| rev @x[3],@x[3] |
| # endif |
| eor @x[0],@x[0],@t[0] @ xor with input |
| add @t[0],sp,#4*(4) |
| eor @x[1],@x[1],@t[1] |
| str @x[0],[r14],#16 @ store output |
| eor @x[2],@x[2],@t[2] |
| str @x[1],[r14,#-12] |
| eor @x[3],@x[3],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| str @x[2],[r14,#-8] |
| str @x[3],[r14,#-4] |
| |
| add @x[4],@x[4],@t[0] @ accumulate key material |
| ldr @t[0],[r12],#16 @ load input |
| add @x[5],@x[5],@t[1] |
| ldr @t[1],[r12,#-12] |
| add @x[6],@x[6],@t[2] |
| ldr @t[2],[r12,#-8] |
| add @x[7],@x[7],@t[3] |
| ldr @t[3],[r12,#-4] |
| # ifdef __ARMEB__ |
| rev @x[4],@x[4] |
| rev @x[5],@x[5] |
| rev @x[6],@x[6] |
| rev @x[7],@x[7] |
| # endif |
| eor @x[4],@x[4],@t[0] |
| add @t[0],sp,#4*(8) |
| eor @x[5],@x[5],@t[1] |
| str @x[4],[r14],#16 @ store output |
| eor @x[6],@x[6],@t[2] |
| str @x[5],[r14,#-12] |
| eor @x[7],@x[7],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| str @x[6],[r14,#-8] |
| add @x[0],sp,#4*(16+8) |
| str @x[7],[r14,#-4] |
| |
| ldmia @x[0],{@x[0]-@x[7]} @ load second half |
| |
| add @x[0],@x[0],@t[0] @ accumulate key material |
| ldr @t[0],[r12],#16 @ load input |
| add @x[1],@x[1],@t[1] |
| ldr @t[1],[r12,#-12] |
| # ifdef __thumb2__ |
| it hi |
| # endif |
| strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it |
| add @x[2],@x[2],@t[2] |
| ldr @t[2],[r12,#-8] |
| # ifdef __thumb2__ |
| it hi |
| # endif |
| strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it |
| add @x[3],@x[3],@t[3] |
| ldr @t[3],[r12,#-4] |
| # ifdef __ARMEB__ |
| rev @x[0],@x[0] |
| rev @x[1],@x[1] |
| rev @x[2],@x[2] |
| rev @x[3],@x[3] |
| # endif |
| eor @x[0],@x[0],@t[0] |
| add @t[0],sp,#4*(12) |
| eor @x[1],@x[1],@t[1] |
| str @x[0],[r14],#16 @ store output |
| eor @x[2],@x[2],@t[2] |
| str @x[1],[r14,#-12] |
| eor @x[3],@x[3],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| str @x[2],[r14,#-8] |
| str @x[3],[r14,#-4] |
| |
| add @x[4],@x[4],@t[0] @ accumulate key material |
| add @t[0],@t[0],#4 @ next counter value |
| add @x[5],@x[5],@t[1] |
| str @t[0],[sp,#4*(12)] @ save next counter value |
| ldr @t[0],[r12],#16 @ load input |
| add @x[6],@x[6],@t[2] |
| add @x[4],@x[4],#3 @ counter+3 |
| ldr @t[1],[r12,#-12] |
| add @x[7],@x[7],@t[3] |
| ldr @t[2],[r12,#-8] |
| ldr @t[3],[r12,#-4] |
| # ifdef __ARMEB__ |
| rev @x[4],@x[4] |
| rev @x[5],@x[5] |
| rev @x[6],@x[6] |
| rev @x[7],@x[7] |
| # endif |
| eor @x[4],@x[4],@t[0] |
| # ifdef __thumb2__ |
| it hi |
| # endif |
| ldrhi @t[0],[sp,#4*(32+2)] @ re-load len |
| eor @x[5],@x[5],@t[1] |
| eor @x[6],@x[6],@t[2] |
| str @x[4],[r14],#16 @ store output |
| eor @x[7],@x[7],@t[3] |
| str @x[5],[r14,#-12] |
| sub @t[3],@t[0],#64*4 @ len-=64*4 |
| str @x[6],[r14,#-8] |
| str @x[7],[r14,#-4] |
| bhi .Loop_neon_outer |
| |
| b .Ldone_neon |
| |
| .align 4 |
| .Lbreak_neon: |
@ harmonize NEON and integer-only stack frames: load data
@ from the NEON frame, but save to the integer-only one; the
@ distance between the two is 4*(32+4+16-32)=4*(20) bytes
| |
| str @t[3], [sp,#4*(20+32+2)] @ save len |
| add @t[3],sp,#4*(32+4) |
| str r12, [sp,#4*(20+32+1)] @ save inp |
| str r14, [sp,#4*(20+32+0)] @ save out |
| |
| ldr @x[12],[sp,#4*(16+10)] |
| ldr @x[14],[sp,#4*(16+11)] |
| vldmia @t[3],{d8-d15} @ fulfill ABI requirement |
| str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]" |
| str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]" |
| |
| ldr @t[3], [sp,#4*(15)] |
| ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load |
| ldr @t[2], [sp,#4*(13)] |
| ldr @x[14],[sp,#4*(14)] |
| str @t[3], [sp,#4*(20+16+15)] |
| add @t[3],sp,#4*(20) |
| vst1.32 {$a0-$b0},[@t[3]]! @ copy key |
| add sp,sp,#4*(20) @ switch frame |
| vst1.32 {$c0-$d0},[@t[3]] |
| mov @t[3],#10 |
| b .Loop @ go integer-only |
| |
| .align 4 |
| .Ltail_neon: |
| cmp @t[3],#64*3 |
| bhs .L192_or_more_neon |
| cmp @t[3],#64*2 |
| bhs .L128_or_more_neon |
| cmp @t[3],#64*1 |
| bhs .L64_or_more_neon |
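
@ fewer than 64 bytes left: park the first block of keystream on
@ the stack and fall into the byte-by-byte tail loop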
| |
| add @t[0],sp,#4*(8) |
| vst1.8 {$a0-$b0},[sp] |
| add @t[2],sp,#4*(0) |
| vst1.8 {$c0-$d0},[@t[0]] |
| b .Loop_tail_neon |
| |
| .align 4 |
| .L64_or_more_neon: |
| vld1.8 {$t0-$t1},[r12]! |
| vld1.8 {$t2-$t3},[r12]! |
| veor $a0,$a0,$t0 |
| veor $b0,$b0,$t1 |
| veor $c0,$c0,$t2 |
| veor $d0,$d0,$t3 |
| vst1.8 {$a0-$b0},[r14]! |
| vst1.8 {$c0-$d0},[r14]! |
| |
| beq .Ldone_neon |
| |
| add @t[0],sp,#4*(8) |
| vst1.8 {$a1-$b1},[sp] |
| add @t[2],sp,#4*(0) |
| vst1.8 {$c1-$d1},[@t[0]] |
| sub @t[3],@t[3],#64*1 @ len-=64*1 |
| b .Loop_tail_neon |
| |
| .align 4 |
| .L128_or_more_neon: |
| vld1.8 {$t0-$t1},[r12]! |
| vld1.8 {$t2-$t3},[r12]! |
| veor $a0,$a0,$t0 |
| veor $b0,$b0,$t1 |
| vld1.8 {$t0-$t1},[r12]! |
| veor $c0,$c0,$t2 |
| veor $d0,$d0,$t3 |
| vld1.8 {$t2-$t3},[r12]! |
| |
| veor $a1,$a1,$t0 |
| veor $b1,$b1,$t1 |
| vst1.8 {$a0-$b0},[r14]! |
| veor $c1,$c1,$t2 |
| vst1.8 {$c0-$d0},[r14]! |
| veor $d1,$d1,$t3 |
| vst1.8 {$a1-$b1},[r14]! |
| vst1.8 {$c1-$d1},[r14]! |
| |
| beq .Ldone_neon |
| |
| add @t[0],sp,#4*(8) |
| vst1.8 {$a2-$b2},[sp] |
| add @t[2],sp,#4*(0) |
| vst1.8 {$c2-$d2},[@t[0]] |
| sub @t[3],@t[3],#64*2 @ len-=64*2 |
| b .Loop_tail_neon |
| |
| .align 4 |
| .L192_or_more_neon: |
| vld1.8 {$t0-$t1},[r12]! |
| vld1.8 {$t2-$t3},[r12]! |
| veor $a0,$a0,$t0 |
| veor $b0,$b0,$t1 |
| vld1.8 {$t0-$t1},[r12]! |
| veor $c0,$c0,$t2 |
| veor $d0,$d0,$t3 |
| vld1.8 {$t2-$t3},[r12]! |
| |
| veor $a1,$a1,$t0 |
| veor $b1,$b1,$t1 |
| vld1.8 {$t0-$t1},[r12]! |
| veor $c1,$c1,$t2 |
| vst1.8 {$a0-$b0},[r14]! |
| veor $d1,$d1,$t3 |
| vld1.8 {$t2-$t3},[r12]! |
| |
| veor $a2,$a2,$t0 |
| vst1.8 {$c0-$d0},[r14]! |
| veor $b2,$b2,$t1 |
| vst1.8 {$a1-$b1},[r14]! |
| veor $c2,$c2,$t2 |
| vst1.8 {$c1-$d1},[r14]! |
| veor $d2,$d2,$t3 |
| vst1.8 {$a2-$b2},[r14]! |
| vst1.8 {$c2-$d2},[r14]! |
| |
| beq .Ldone_neon |
| |
| ldmia sp,{@t[0]-@t[3]} @ load key material |
| add @x[0],@x[0],@t[0] @ accumulate key material |
| add @t[0],sp,#4*(4) |
| add @x[1],@x[1],@t[1] |
| add @x[2],@x[2],@t[2] |
| add @x[3],@x[3],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| |
| add @x[4],@x[4],@t[0] @ accumulate key material |
| add @t[0],sp,#4*(8) |
| add @x[5],@x[5],@t[1] |
| add @x[6],@x[6],@t[2] |
| add @x[7],@x[7],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| # ifdef __ARMEB__ |
| rev @x[0],@x[0] |
| rev @x[1],@x[1] |
| rev @x[2],@x[2] |
| rev @x[3],@x[3] |
| rev @x[4],@x[4] |
| rev @x[5],@x[5] |
| rev @x[6],@x[6] |
| rev @x[7],@x[7] |
| # endif |
| stmia sp,{@x[0]-@x[7]} |
| add @x[0],sp,#4*(16+8) |
| |
| ldmia @x[0],{@x[0]-@x[7]} @ load second half |
| |
| add @x[0],@x[0],@t[0] @ accumulate key material |
| add @t[0],sp,#4*(12) |
| add @x[1],@x[1],@t[1] |
| add @x[2],@x[2],@t[2] |
| add @x[3],@x[3],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| |
| add @x[4],@x[4],@t[0] @ accumulate key material |
| add @t[0],sp,#4*(8) |
| add @x[5],@x[5],@t[1] |
| add @x[4],@x[4],#3 @ counter+3 |
| add @x[6],@x[6],@t[2] |
| add @x[7],@x[7],@t[3] |
| ldr @t[3],[sp,#4*(32+2)] @ re-load len |
| # ifdef __ARMEB__ |
| rev @x[0],@x[0] |
| rev @x[1],@x[1] |
| rev @x[2],@x[2] |
| rev @x[3],@x[3] |
| rev @x[4],@x[4] |
| rev @x[5],@x[5] |
| rev @x[6],@x[6] |
| rev @x[7],@x[7] |
| # endif |
| stmia @t[0],{@x[0]-@x[7]} |
| add @t[2],sp,#4*(0) |
| sub @t[3],@t[3],#64*3 @ len-=64*3 |
| |
| .Loop_tail_neon: |
| ldrb @t[0],[@t[2]],#1 @ read buffer on stack |
| ldrb @t[1],[r12],#1 @ read input |
| subs @t[3],@t[3],#1 |
| eor @t[0],@t[0],@t[1] |
| strb @t[0],[r14],#1 @ store output |
| bne .Loop_tail_neon |
| |
| .Ldone_neon: |
| add sp,sp,#4*(32+4) |
| vldmia sp,{d8-d15} |
| add sp,sp,#4*(16+3) |
| ldmia sp!,{r4-r11,pc} |
| .size ChaCha20_neon,.-ChaCha20_neon |
| .comm OPENSSL_armcap_P,4,4 |
| #endif |
| ___ |
| }}} |
| |
| foreach (split("\n",$code)) { |
| s/\`([^\`]*)\`/eval $1/geo; |
| |
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;	# e.g. q12#lo -> d24, q12#hi -> d25
| |
| print $_,"\n"; |
| } |
close STDOUT or die "error closing STDOUT: $!";