|  | #! /usr/bin/env perl | 
|  | # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. | 
|  | # | 
|  | # Licensed under the OpenSSL license (the "License").  You may not use | 
|  | # this file except in compliance with the License.  You can obtain a copy | 
|  | # in the file LICENSE in the source distribution or at | 
|  | # https://www.openssl.org/source/license.html | 
|  |  | 
|  | # ==================================================================== | 
|  | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | 
|  | # project. The module is, however, dual licensed under OpenSSL and | 
|  | # CRYPTOGAMS licenses depending on where you obtain it. For further | 
|  | # details see http://www.openssl.org/~appro/cryptogams/. | 
|  | # ==================================================================== | 
|  |  | 
|  | # This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It | 
|  | # implements the multiplication algorithm described in: | 
|  | # | 
|  | # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software | 
|  | # Polynomial Multiplication on ARM Processors using the NEON Engine. | 
|  | # | 
|  | # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf | 
|  | # | 
|  | # The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is | 
|  | # AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit | 
|  | # NEON, the low and high halves of the 128-bit register q0 are accessible as | 
|  | # 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of | 
|  | # vN. Where the 32-bit version would use the upper half, this file must keep | 
|  | # halves in separate registers. | 
|  | # | 
|  | # The other distinction is in syntax. 32-bit NEON embeds lane information in the | 
|  | # instruction name, while AArch64 uses suffixes on the registers. For instance, | 
|  | # left-shifting 64-bit lanes of a SIMD register in 32-bit would be written: | 
|  | # | 
|  | #     vshl.i64 q0, q0, #1 | 
|  | # | 
|  | # in 64-bit, it would be written: | 
|  | # | 
|  | #     shl v0.2d, v0.2d, #1 | 
|  | # | 
|  | # See Programmer's Guide for ARMv8-A, section 7 for details. | 
|  | # http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf | 
|  | # | 
|  | # Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ | 
|  | # only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials | 
|  | # and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit | 
|  | # polynomial and is conditioned on the PMULL extension. This file emulates the | 
|  | # latter with the former. | 
|  |  | 
|  | use strict; | 
|  |  | 
|  | my $flavour = shift; | 
|  | my $output; | 
|  | if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } | 
|  | else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } | 
|  |  | 
|  | if ($flavour && $flavour ne "void") { | 
|  | $0 =~ m/(.*[\/\\])[^\/\\]+$/; | 
|  | my $dir = $1; | 
|  | my $xlate; | 
|  | ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or | 
|  | ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or | 
|  | die "can't locate arm-xlate.pl"; | 
|  |  | 
|  | open OUT,"| \"$^X\" $xlate $flavour $output"; | 
|  | *STDOUT=*OUT; | 
|  | } else { | 
|  | open OUT,">$output"; | 
|  | *STDOUT=*OUT; | 
|  | } | 
|  |  | 
|  | my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3));	# argument block | 
|  | my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4)); | 
|  | my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7)); | 
|  | # d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers | 
|  | # to spare. | 
|  | my ($t0, $t1, $t2, $t3) = map("v$_", (16..19)); | 
|  | my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23)); | 
|  | my ($k48_k32, $k16_k0) = map("v$_", (24..25)); | 
|  |  | 
|  | my $code = ""; | 
|  |  | 
|  | # clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b | 
|  | # must be distinct from $t* and $k*. $t* are clobbered by the emitted code. | 
|  | sub clmul64x64 { | 
|  | my ($r, $a, $b) = @_; | 
|  | $code .= <<___; | 
|  | ext	$t0.8b, $a.8b, $a.8b, #1	// A1 | 
|  | pmull	$t0.8h, $t0.8b, $b.8b		// F = A1*B | 
|  | ext	$r.8b, $b.8b, $b.8b, #1		// B1 | 
|  | pmull	$r.8h, $a.8b, $r.8b		// E = A*B1 | 
|  | ext	$t1.8b, $a.8b, $a.8b, #2	// A2 | 
|  | pmull	$t1.8h, $t1.8b, $b.8b		// H = A2*B | 
|  | ext	$t3.8b, $b.8b, $b.8b, #2	// B2 | 
|  | pmull	$t3.8h, $a.8b, $t3.8b		// G = A*B2 | 
|  | ext	$t2.8b, $a.8b, $a.8b, #3	// A3 | 
|  | eor	$t0.16b, $t0.16b, $r.16b	// L = E + F | 
|  | pmull	$t2.8h, $t2.8b, $b.8b		// J = A3*B | 
|  | ext	$r.8b, $b.8b, $b.8b, #3		// B3 | 
|  | eor	$t1.16b, $t1.16b, $t3.16b	// M = G + H | 
|  | pmull	$r.8h, $a.8b, $r.8b		// I = A*B3 | 
|  |  | 
|  | // Here we diverge from the 32-bit version. It computes the following | 
|  | // (instructions reordered for clarity): | 
|  | // | 
|  | //     veor	\$t0#lo, \$t0#lo, \$t0#hi	@ t0 = P0 + P1 (L) | 
|  | //     vand	\$t0#hi, \$t0#hi, \$k48 | 
|  | //     veor	\$t0#lo, \$t0#lo, \$t0#hi | 
|  | // | 
|  | //     veor	\$t1#lo, \$t1#lo, \$t1#hi	@ t1 = P2 + P3 (M) | 
|  | //     vand	\$t1#hi, \$t1#hi, \$k32 | 
|  | //     veor	\$t1#lo, \$t1#lo, \$t1#hi | 
|  | // | 
|  | //     veor	\$t2#lo, \$t2#lo, \$t2#hi	@ t2 = P4 + P5 (N) | 
|  | //     vand	\$t2#hi, \$t2#hi, \$k16 | 
|  | //     veor	\$t2#lo, \$t2#lo, \$t2#hi | 
|  | // | 
|  | //     veor	\$t3#lo, \$t3#lo, \$t3#hi	@ t3 = P6 + P7 (K) | 
|  | //     vmov.i64	\$t3#hi, #0 | 
|  | // | 
|  | // \$kN is a mask with the bottom N bits set. AArch64 cannot compute on | 
|  | // upper halves of SIMD registers, so we must split each half into | 
|  | // separate registers. To compensate, we pair computations up and | 
|  | // parallelize. | 
|  |  | 
|  | ext	$t3.8b, $b.8b, $b.8b, #4	// B4 | 
|  | eor	$t2.16b, $t2.16b, $r.16b	// N = I + J | 
|  | pmull	$t3.8h, $a.8b, $t3.8b		// K = A*B4 | 
|  |  | 
|  | // This can probably be scheduled more efficiently. For now, we just | 
|  | // pair up independent instructions. | 
|  | zip1	$t0l_t1l.2d, $t0.2d, $t1.2d | 
|  | zip1	$t2l_t3l.2d, $t2.2d, $t3.2d | 
|  | zip2	$t0h_t1h.2d, $t0.2d, $t1.2d | 
|  | zip2	$t2h_t3h.2d, $t2.2d, $t3.2d | 
|  | eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b | 
|  | eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b | 
|  | and	$t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b | 
|  | and	$t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b | 
|  | eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b | 
|  | eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b | 
|  | zip1	$t0.2d, $t0l_t1l.2d, $t0h_t1h.2d | 
|  | zip1	$t2.2d, $t2l_t3l.2d, $t2h_t3h.2d | 
|  | zip2	$t1.2d, $t0l_t1l.2d, $t0h_t1h.2d | 
|  | zip2	$t3.2d, $t2l_t3l.2d, $t2h_t3h.2d | 
|  |  | 
|  | ext	$t0.16b, $t0.16b, $t0.16b, #15	// t0 = t0 << 8 | 
|  | ext	$t1.16b, $t1.16b, $t1.16b, #14	// t1 = t1 << 16 | 
|  | pmull	$r.8h, $a.8b, $b.8b		// D = A*B | 
|  | ext	$t3.16b, $t3.16b, $t3.16b, #12	// t3 = t3 << 32 | 
|  | ext	$t2.16b, $t2.16b, $t2.16b, #13	// t2 = t2 << 24 | 
|  | eor	$t0.16b, $t0.16b, $t1.16b | 
|  | eor	$t2.16b, $t2.16b, $t3.16b | 
|  | eor	$r.16b, $r.16b, $t0.16b | 
|  | eor	$r.16b, $r.16b, $t2.16b | 
|  | ___ | 
|  | } | 
|  |  | 
|  | $code .= <<___; | 
|  | .text | 
|  |  | 
|  | .global	gcm_init_neon | 
|  | .type	gcm_init_neon,%function | 
|  | .align	4 | 
|  | gcm_init_neon: | 
|  | // This function is adapted from gcm_init_v8. xC2 is t3. | 
|  | ld1	{$t1.2d}, [x1]			// load H | 
|  | movi	$t3.16b, #0xe1 | 
|  | shl	$t3.2d, $t3.2d, #57		// 0xc2.0 | 
|  | ext	$INlo.16b, $t1.16b, $t1.16b, #8 | 
|  | ushr	$t2.2d, $t3.2d, #63 | 
|  | dup	$t1.4s, $t1.s[1] | 
|  | ext	$t0.16b, $t2.16b, $t3.16b, #8	// t0=0xc2....01 | 
|  | ushr	$t2.2d, $INlo.2d, #63 | 
|  | sshr	$t1.4s, $t1.4s, #31		// broadcast carry bit | 
|  | and	$t2.16b, $t2.16b, $t0.16b | 
|  | shl	$INlo.2d, $INlo.2d, #1 | 
|  | ext	$t2.16b, $t2.16b, $t2.16b, #8 | 
|  | and	$t0.16b, $t0.16b, $t1.16b | 
|  | orr	$INlo.16b, $INlo.16b, $t2.16b	// H<<<=1 | 
|  | eor	$Hlo.16b, $INlo.16b, $t0.16b	// twisted H | 
|  | st1	{$Hlo.2d}, [x0]			// store Htable[0] | 
|  | ret | 
|  | .size	gcm_init_neon,.-gcm_init_neon | 
|  |  | 
|  | .global	gcm_gmult_neon | 
|  | .type	gcm_gmult_neon,%function | 
|  | .align	4 | 
|  | gcm_gmult_neon: | 
|  | ld1	{$INlo.16b}, [$Xi]		// load Xi | 
|  | ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H | 
|  | ld1	{$Hhi.1d}, [$Htbl] | 
|  | adrp	x9, :pg_hi21:.Lmasks		// load constants | 
|  | add	x9, x9, :lo12:.Lmasks | 
|  | ld1	{$k48_k32.2d, $k16_k0.2d}, [x9] | 
|  | rev64	$INlo.16b, $INlo.16b		// byteswap Xi | 
|  | ext	$INlo.16b, $INlo.16b, $INlo.16b, #8 | 
|  | eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing | 
|  |  | 
|  | mov	$len, #16 | 
|  | b	.Lgmult_neon | 
|  | .size	gcm_gmult_neon,.-gcm_gmult_neon | 
|  |  | 
|  | .global	gcm_ghash_neon | 
|  | .type	gcm_ghash_neon,%function | 
|  | .align	4 | 
|  | gcm_ghash_neon: | 
|  | ld1	{$Xl.16b}, [$Xi]		// load Xi | 
|  | ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H | 
|  | ld1	{$Hhi.1d}, [$Htbl] | 
|  | adrp	x9, :pg_hi21:.Lmasks		// load constants | 
|  | add	x9, x9, :lo12:.Lmasks | 
|  | ld1	{$k48_k32.2d, $k16_k0.2d}, [x9] | 
|  | rev64	$Xl.16b, $Xl.16b		// byteswap Xi | 
|  | ext	$Xl.16b, $Xl.16b, $Xl.16b, #8 | 
|  | eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing | 
|  |  | 
|  | .Loop_neon: | 
|  | ld1	{$INlo.16b}, [$inp], #16	// load inp | 
|  | rev64	$INlo.16b, $INlo.16b		// byteswap inp | 
|  | ext	$INlo.16b, $INlo.16b, $INlo.16b, #8 | 
|  | eor	$INlo.16b, $INlo.16b, $Xl.16b	// inp ^= Xi | 
|  |  | 
|  | .Lgmult_neon: | 
|  | // Split the input into $INlo and $INhi. (The upper halves are unused, | 
|  | // so it is okay to leave them alone.) | 
|  | ins	$INhi.d[0], $INlo.d[1] | 
|  | ___ | 
|  | &clmul64x64	($Xl, $Hlo, $INlo);		# H.lo·Xi.lo | 
|  | $code .= <<___; | 
|  | eor	$INlo.8b, $INlo.8b, $INhi.8b	// Karatsuba pre-processing | 
|  | ___ | 
|  | &clmul64x64	($Xm, $Hhl, $INlo);		# (H.lo+H.hi)·(Xi.lo+Xi.hi) | 
|  | &clmul64x64	($Xh, $Hhi, $INhi);		# H.hi·Xi.hi | 
|  | $code .= <<___; | 
|  | ext	$t0.16b, $Xl.16b, $Xh.16b, #8 | 
|  | eor	$Xm.16b, $Xm.16b, $Xl.16b	// Karatsuba post-processing | 
|  | eor	$Xm.16b, $Xm.16b, $Xh.16b | 
|  | eor	$Xm.16b, $Xm.16b, $t0.16b	// Xm overlaps Xh.lo and Xl.hi | 
|  | ins	$Xl.d[1], $Xm.d[0]		// Xh|Xl - 256-bit result | 
|  | // This is a no-op due to the ins instruction below. | 
|  | // ins	$Xh.d[0], $Xm.d[1] | 
|  |  | 
|  | // equivalent of reduction_avx from ghash-x86_64.pl | 
|  | shl	$t1.2d, $Xl.2d, #57		// 1st phase | 
|  | shl	$t2.2d, $Xl.2d, #62 | 
|  | eor	$t2.16b, $t2.16b, $t1.16b	// | 
|  | shl	$t1.2d, $Xl.2d, #63 | 
|  | eor	$t2.16b, $t2.16b, $t1.16b	// | 
|  | // Note Xm contains {Xl.d[1], Xh.d[0]}. | 
|  | eor	$t2.16b, $t2.16b, $Xm.16b | 
|  | ins	$Xl.d[1], $t2.d[0]		// Xl.d[1] ^= t2.d[0] | 
|  | ins	$Xh.d[0], $t2.d[1]		// Xh.d[0] ^= t2.d[1] | 
|  |  | 
|  | ushr	$t2.2d, $Xl.2d, #1		// 2nd phase | 
|  | eor	$Xh.16b, $Xh.16b,$Xl.16b | 
|  | eor	$Xl.16b, $Xl.16b,$t2.16b	// | 
|  | ushr	$t2.2d, $t2.2d, #6 | 
|  | ushr	$Xl.2d, $Xl.2d, #1		// | 
|  | eor	$Xl.16b, $Xl.16b, $Xh.16b	// | 
|  | eor	$Xl.16b, $Xl.16b, $t2.16b	// | 
|  |  | 
|  | subs	$len, $len, #16 | 
|  | bne	.Loop_neon | 
|  |  | 
|  | rev64	$Xl.16b, $Xl.16b		// byteswap Xi and write | 
|  | ext	$Xl.16b, $Xl.16b, $Xl.16b, #8 | 
|  | st1	{$Xl.16b}, [$Xi] | 
|  |  | 
|  | ret | 
|  | .size	gcm_ghash_neon,.-gcm_ghash_neon | 
|  |  | 
|  | .section	.rodata | 
|  | .align	4 | 
|  | .Lmasks: | 
|  | .quad	0x0000ffffffffffff	// k48 | 
|  | .quad	0x00000000ffffffff	// k32 | 
|  | .quad	0x000000000000ffff	// k16 | 
|  | .quad	0x0000000000000000	// k0 | 
|  | .asciz  "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>" | 
|  | .align  2 | 
|  | ___ | 
|  |  | 
|  | foreach (split("\n",$code)) { | 
|  | s/\`([^\`]*)\`/eval $1/geo; | 
|  |  | 
|  | print $_,"\n"; | 
|  | } | 
|  | close STDOUT or die "error closing STDOUT"; # enforce flush |