#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind the 32-bit assembler implementation. This is unlike
# Opteron, where compiler-generated code was only 15% behind the
# 32-bit assembler, which originally made it hard to motivate the
# effort. There was a suggestion to mechanically translate the 32-bit
# code, but I dismissed it, reasoning that x86_64 offers enough
# register bank capacity to fully utilize SHA-1 parallelism. Hence
# this fresh implementation:-) However! While 64-bit code does perform
# better on Opteron, I failed to beat 32-bit assembler on EM64T core.
# Well, x86_64 does offer a larger *addressable* bank, but the
# out-of-order core reaches for even more registers through dynamic
# aliasing, and the EM64T core must have managed to run-time optimize
# even 32-bit code just as well as 64-bit code. The performance
# improvement is summarized in the following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize the
# "distance" between the instructions producing input to the 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for the Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload the message schedule, denoted by Wt in the NIST
# specification, or Xupdate in OpenSSL source, to the SIMD unit. See
# the sha1-586.pl module for background and implementation details.
# The only difference from the 32-bit code is that the 64-bit code
# doesn't have to spill @X[] elements to free temporary registers.

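# For reference, the schedule being offloaded is the standard SHA-1
# recurrence; a scalar Perl sketch (illustrative only, not code this
# module runs):
#
#	for my $t (16..79) {
#	    my $x = $W[$t-3] ^ $W[$t-8] ^ $W[$t-14] ^ $W[$t-16];
#	    $W[$t] = (($x << 1) | ($x >> 31)) & 0xffffffff;	# rol($x,1)
#	}
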
# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. The initial attempt (utilizing BMI
# instructions and loading a pair of consecutive blocks into 256-bit
# %ymm registers) did not provide an impressive performance
# improvement until a crucial hint regarding the number of Xupdate
# iterations to pre-compute in advance was provided by Ilya Albrekht
# of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in the following table. Numbers
# are CPU clock cycles spent to process a single byte (less is
# better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

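# Typical invocation (a usage sketch; the exact flavours and paths are
# whatever x86_64-xlate.pl and the build system driving this script
# understand):
#
#   perl sha1-x86_64.pl elf sha1-x86_64.S	# GNU assembler output
#   perl sha1-x86_64.pl nasm sha1-x86_64.asm	# Win64 NASM output
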
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
$avx = 1;

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext=0;	### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

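# Each BODY_* generator below emits one scalar SHA-1 round, i.e. (a
# reference sketch, not code that runs here):
#
#	$e += rol($a,5) + F($b,$c,$d) + $X[$i] + $K;  $b = rol($b,30);
#
# where F is Ch for rounds 0-19, computed branchlessly as ((c^d)&b)^d,
# Parity (b^c^d) for rounds 20-39 and 60-79, and Maj for rounds 40-59,
# computed as (c&d) + (b&(c^d)); the two terms have disjoint set bits,
# so plain additions work.
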
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}

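# The dispatch below reads the OPENSSL_ia32cap capability vector;
# assuming the usual BoringSSL layout, word 0 is CPUID.1:EDX (with bit
# 30 reused as a synthetic "Intel CPU" flag), word 1 is CPUID.1:ECX
# (bit 9 = SSSE3, bit 28 = AVX), and word 2 is CPUID.7:EBX (bit 3 =
# BMI1, bit 5 = AVX2, bit 8 = BMI2, bit 29 = SHA).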
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_addr

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	lea	OPENSSL_ia32cap_addr(%rip),%r10
	mov	(%r10),%r10
	mov	0(%r10),%r9d
	mov	4(%r10),%r8d
	mov	8(%r10),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

273.Lialu:
Adam Langley5c6ca972014-06-20 12:00:00 -0700274 mov %rsp,%rax
Adam Langley95c29f32014-06-20 12:00:00 -0700275 push %rbx
276 push %rbp
277 push %r12
278 push %r13
Adam Langley5c6ca972014-06-20 12:00:00 -0700279 push %r14
Adam Langley95c29f32014-06-20 12:00:00 -0700280 mov %rdi,$ctx # reassigned argument
281 sub \$`8+16*4`,%rsp
282 mov %rsi,$inp # reassigned argument
283 and \$-64,%rsp
284 mov %rdx,$num # reassigned argument
Adam Langley5c6ca972014-06-20 12:00:00 -0700285 mov %rax,`16*4`(%rsp)
Adam Langley95c29f32014-06-20 12:00:00 -0700286.Lprologue:
287
288 mov 0($ctx),$A
289 mov 4($ctx),$B
290 mov 8($ctx),$C
291 mov 12($ctx),$D
292 mov 16($ctx),$E
293 jmp .Lloop
294
295.align 16
296.Lloop:
297___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
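
# In C terms, the routine above (and the SIMD variants below) behaves
# like this prototype (a descriptive sketch, not the authoritative
# header declaration):
#
#   void sha1_block_data_order(uint32_t state[5], const uint8_t *data,
#                              size_t num);	/* num = 64-byte blocks */
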
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

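# Roughly (per the Intel SDM): sha1rnds4 performs four rounds at once,
# with the immediate selecting the round-function/constant group;
# sha1nexte derives the next quartet's E term from the rotated state
# word; sha1msg1/sha1msg2 implement the two halves of the message
# schedule recurrence. The loop below ping-pongs between $E and $E_ so
# consecutive quartets can overlap.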
$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
___
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec	$num
	lea	0x40($inp),%r8		# next input block
	paddd	@MSG[0],$E
	cmovne	%r8,$inp
	movdqa	$ABCD,$ABCD_SAVE	# offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqu	($inp),@MSG[0]
	movdqa	$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD	# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu	0x10($inp),@MSG[1]
	pshufb	$BSWAP,@MSG[0]

	movdqa	$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD	# 68-71
	sha1nexte	@MSG[2],$E
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[1]

	movdqa	$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD	# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[2]

	movdqa	$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD	# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb	$BSWAP,@MSG[3]

	paddd	$ABCD_SAVE,$ABCD
	movdqa	$E,$E_SAVE		# offload $E

	jnz	.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r14";
my $fp="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	mov	%rsp,$fp	# frame pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A	# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]	# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

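# Any sub not defined here is routed through AUTOLOAD above and emitted
# as a text instruction; for instance, with @X and @Tx in their initial
# rotation, &pxor(@X[0],@Tx[0]) appends "\tpxor\t%xmm8,%xmm4\n" to
# $code (arguments are reversed into AT&T src,dst order).
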
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd	(@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	foreach (@insns) { eval; }	# remaining instructions

	$Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

	$Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
	$Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}
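
# The SSSE3/AVX paths are software-pipelined: Xuplast_* stores the last
# X[]+K quartet and, unless this was the final block, starts loading
# and byte-swapping the next block; the three Xloop_* calls then retire
# the remaining rounds while that prefetch completes. Xtail_* is the
# drain-only variant used once the last block has been scheduled.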

sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

	$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
	$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	mov	%rsp,$fp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A	# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]	# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	&vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	&vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	&vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	foreach (@insns) { eval; }	# remaining instructions

	$Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

	$Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
	$Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

	$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
	$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___

if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
	mov	%rsp,$fp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	cmp	$num,$frame
	cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu	($inp),%xmm0
	vmovdqu	16($inp),%xmm1
	vmovdqu	32($inp),%xmm2
	vmovdqu	48($inp),%xmm3
	lea	64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vmovdqu	-64($K_XX_XX),$Kx	# K_00_19
	vpshufb	@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
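# Note the block layout: each 256-bit @X register holds the same 16
# bytes of two consecutive blocks -- block N in the low 128-bit lane
# (loaded via vmovdqu above) and block N+1 in the high lane (via
# vinserti128 from $frame) -- so every vector op below computes two
# blocks' schedules at once.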
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	'&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	'&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	'&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
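
# How BMI shortens the bodyx_00_19 critical path: Ch(b,c,d) =
# (b&c)^(~b&d) for the *next* round is assembled off the path, roughly
# (a pseudo-register sketch, not the emitted register allocation):
#
#	t  = andn(b,d)		# ~b & d, three-operand, no extra mov
#	f  = b & c		# via non-destructive rotation of names
#	f ^= t			# f = Ch(b,c,d), ready when needed
#
# while rorx provides non-destructive rotates for a<<<5 and b>>>2.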

sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi		# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame		# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]	# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	mov	@ROTX[0],$A		# A=d
	add	16($ctx),@ROTX[5]
	mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	mov	@ROTX[4],$D		# D=b
	#xchg	@ROTX[5],$F		# F=c, C=f
	mov	@ROTX[4],12($ctx)
	mov	@ROTX[1],$F		# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	mov	@ROTX[5],$E		# E=c
	mov	$a5,$C			# C=f
	#xchg	$F,$E			# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask
	cmp	$num,%rdi		# borrowed $t0
	ja	.Last_avx2

	vmovdqu	-64(%rdi),%xmm0		# low part of @X[-4&7]
	vmovdqu	-48(%rdi),%xmm1
	vmovdqu	-32(%rdi),%xmm2
	vmovdqu	-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);

$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]	# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	mov	@ROTX[0],$A		# A=d
	add	16($ctx),@ROTX[5]
	mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	mov	@ROTX[4],$D		# D=b
	#xchg	@ROTX[5],$F		# F=c, C=f
	mov	@ROTX[4],12($ctx)
	mov	@ROTX[1],$F		# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	mov	@ROTX[5],$E		# E=c
	mov	$a5,$C			# C=f
	#xchg	$F,$E			# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_avx2:
	ret
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
}
1757$code.=<<___;
1758.align 64
1759K_XX_XX:
1760.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1761.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1762.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1763.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1764.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1765.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1766.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1767.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1768.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1769.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
Adam Langleycb5dd632014-06-20 12:00:00 -07001770.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
Adam Langley95c29f32014-06-20 12:00:00 -07001771___
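# A note on the table above: the four SHA-1 round constants are
# floor(2^30 * sqrt(n)) for n = 2, 3, 5, 10 (FIPS 180-4), each
# replicated across lanes so a single aligned load broadcasts it to
# every 32-bit element. They can be re-derived with a one-liner
# (shown for verification only, not used by this module):
#
#   perl -e 'printf "0x%08x\n", int(sqrt($_)*2**30) for (2,3,5,10)'
#   # => 0x5a827999 0x6ed9eba1 0x8f1bbcdc 0xca62c1d6
#
# The trailing .byte row is the 16-byte reversal mask used by the
# SHA Extensions code path's pshufb.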
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
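#
# Each handler below tells RtlVirtualUnwind how to recover the
# caller's frame when an exception hits one of the SHA-1 routines: it
# locates the saved stack pointer, copies the callee-saved GPRs (and,
# where applicable, spilled XMM registers) back into *context, then
# finishes with the shared unwind sequence in .Lcommon_seh_tail. The
# ssse3_handler variant reads a HandlerData[] pair of prologue and
# epilogue labels, so the one handler serves the SSSE3, AVX and AVX2
# functions alike.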
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq
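	# (8 qwords = the four XMM registers, xmm6..xmm9, that the
	# SHAEXT prologue spilled, copied back into the CONTEXT record
	# starting at its Xmm6 slot.)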

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq
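	# (12 qwords = the six spilled XMM registers, xmm6..xmm11,
	# copied back into the CONTEXT record starting at Xmm6.)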

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
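	# Each UNWIND_INFO record below is version 1 with the
	# UNW_FLAG_EHANDLER flag set (hence .byte 9,0,0,0), followed by
	# the .rva of its language-specific handler and, where one is
	# used, the HandlerData[] prologue/epilogue label pair.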
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

sub sha1rnds4 {
    if ($_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$_[0];
    }
}
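# A worked example (illustrative only): "sha1rnds4 \$3,%xmm4,%xmm0"
# matches with $1=3, $2=4 (source, ModR/M r/m field) and $3=0
# (destination, ModR/M reg field), giving ModR/M 0xc0|4|(0<<3)=0xc4
# and emitting ".byte 0x0f,0x3a,0xcc,0xc4,3" -- the SHA1RNDS4
# encoding 0F 3A CC /r ib -- so the output assembles even on
# toolchains that predate the SHA extensions.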

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
	"sha1nexte" => 0xc8,
	"sha1msg1"  => 0xc9,
	"sha1msg2"  => 0xca );

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my @opcode=(0x0f,0x38);
	my $rex=0;
	$rex|=0x04			if ($2>=8);
	$rex|=0x01			if ($1>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
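# Another worked example (illustrative only): "sha1nexte %xmm8,%xmm1"
# matches with $1=8 (source, r/m) and $2=1 (destination, reg), so a
# REX.B prefix 0x41 is prepended and the result is
# ".byte 0x41,0x0f,0x38,0xc8,0xc8" -- SHA1NEXTE's 0F 38 C8 /r
# encoding with ModR/M 0xc0|(8&7)|((1&7)<<3)=0xc8.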

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
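# The final pass above evaluates any remaining `...` expressions and
# rewrites sha1* mnemonics through the encoders, so the emitted
# assembly never depends on assembler support for the SHA extensions.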
close STDOUT;