#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind the 32-bit assembler implementation. This is unlike
# Opteron, where compiler-generated code was only 15% behind the
# 32-bit assembler, which originally made it hard to motivate the
# effort. There was a suggestion to mechanically translate the 32-bit
# code, but I dismissed it, reasoning that x86_64 offers enough
# register bank capacity to fully utilize SHA-1 parallelism. Hence
# this fresh implementation:-) However! While 64-bit code does perform
# better on Opteron, I failed to beat 32-bit assembler on EM64T core.
# Well, x86_64 does offer a larger *addressable* bank, but the
# out-of-order core reaches for even more registers through dynamic
# aliasing, and the EM64T core must have managed to run-time optimize
# even 32-bit code just as well as 64-bit code. The performance
# improvement is summarized in the following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize the
# "distance" between the instructions producing input to the 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for the Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload the message schedule, denoted by Wt in the NIST
# specification, or Xupdate in OpenSSL source, to the SIMD unit. See
# the sha1-586.pl module for background and implementation details.
# The only difference from the 32-bit code is that the 64-bit code
# doesn't have to spill @X[] elements to free temporary registers.

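# For reference, the schedule being offloaded is the standard SHA-1
# recurrence; a scalar Perl sketch (illustrative only, not code this
# module runs):
#
#	for my $t (16..79) {
#	    my $x = $W[$t-3] ^ $W[$t-8] ^ $W[$t-14] ^ $W[$t-16];
#	    $W[$t] = (($x << 1) | ($x >> 31)) & 0xffffffff;	# rol($x,1)
#	}
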
# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. The initial attempt (utilizing BMI
# instructions and loading a pair of consecutive blocks into 256-bit
# %ymm registers) did not provide an impressive performance
# improvement until a crucial hint regarding the number of Xupdate
# iterations to pre-compute in advance was provided by Ilya Albrekht
# of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in the following table. Numbers
# are CPU clock cycles spent to process a single byte (less is
# better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

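# Typical invocation (a usage sketch; the exact flavours and paths are
# whatever x86_64-xlate.pl and the build system driving this script
# understand):
#
#   perl sha1-x86_64.pl elf sha1-x86_64.S	# GNU assembler output
#   perl sha1-x86_64.pl nasm sha1-x86_64.asm	# Win64 NASM output
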
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
$avx = 1;

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext=0;	### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

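# Each BODY_* generator below emits one scalar SHA-1 round, i.e. (a
# reference sketch, not code that runs here):
#
#	$e += rol($a,5) + F($b,$c,$d) + $X[$i] + $K;  $b = rol($b,30);
#
# where F is Ch for rounds 0-19, computed branchlessly as ((c^d)&b)^d,
# Parity (b^c^d) for rounds 20-39 and 60-79, and Maj for rounds 40-59,
# computed as (c&d) + (b&(c^d)); the two terms have disjoint set bits,
# so plain additions work.
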
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}

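# The dispatch below reads the OPENSSL_ia32cap capability vector;
# assuming the usual BoringSSL layout, word 0 is CPUID.1:EDX (with bit
# 30 reused as a synthetic "Intel CPU" flag), word 1 is CPUID.1:ECX
# (bit 9 = SSSE3, bit 28 = AVX), and word 2 is CPUID.7:EBX (bit 3 =
# BMI1, bit 5 = AVX2, bit 8 = BMI2, bit 29 = SHA).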
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_addr

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	lea	OPENSSL_ia32cap_addr(%rip),%r10
	mov	(%r10),%r10
	mov	0(%r10),%r9d
	mov	4(%r10),%r8d
	mov	8(%r10),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

273.Lialu:
Adam Langley5c6ca972014-06-20 12:00:00 -0700274 mov %rsp,%rax
Adam Langley95c29f32014-06-20 12:00:00 -0700275 push %rbx
276 push %rbp
277 push %r12
278 push %r13
Adam Langley5c6ca972014-06-20 12:00:00 -0700279 push %r14
Adam Langley95c29f32014-06-20 12:00:00 -0700280 mov %rdi,$ctx # reassigned argument
281 sub \$`8+16*4`,%rsp
282 mov %rsi,$inp # reassigned argument
283 and \$-64,%rsp
284 mov %rdx,$num # reassigned argument
Adam Langley5c6ca972014-06-20 12:00:00 -0700285 mov %rax,`16*4`(%rsp)
Adam Langley95c29f32014-06-20 12:00:00 -0700286.Lprologue:
287
288 mov 0($ctx),$A
289 mov 4($ctx),$B
290 mov 8($ctx),$C
291 mov 12($ctx),$D
292 mov 16($ctx),$E
293 jmp .Lloop
294
295.align 16
296.Lloop:
297___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
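
# In C terms, the routine above (and the SIMD variants below) behaves
# like this prototype (a descriptive sketch, not the authoritative
# header declaration):
#
#   void sha1_block_data_order(uint32_t state[5], const uint8_t *data,
#                              size_t num);	/* num = 64-byte blocks */
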
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

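# Roughly (per the Intel SDM): sha1rnds4 performs four rounds at once,
# with the immediate selecting the round-function/constant group;
# sha1nexte derives the next quartet's E term from the rotated state
# word; sha1msg1/sha1msg2 implement the two halves of the message
# schedule recurrence. The loop below ping-pongs between $E and $E_ so
# consecutive quartets can overlap.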
$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
___
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec	$num
	lea	0x40($inp),%r8		# next input block
	paddd	@MSG[0],$E
	cmovne	%r8,$inp
	movdqa	$ABCD,$ABCD_SAVE	# offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqu	($inp),@MSG[0]
	movdqa	$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD	# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu	0x10($inp),@MSG[1]
	pshufb	$BSWAP,@MSG[0]

	movdqa	$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD	# 68-71
	sha1nexte	@MSG[2],$E
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[1]

	movdqa	$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD	# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[2]

	movdqa	$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD	# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb	$BSWAP,@MSG[3]

	paddd	$ABCD_SAVE,$ABCD
	movdqa	$E,$E_SAVE		# offload $E

	jnz	.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r14";
my $fp="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	mov	%rsp,$fp	# frame pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A	# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]	# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

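# Any sub not defined here is routed through AUTOLOAD above and emitted
# as a text instruction; for instance, with @X and @Tx in their initial
# rotation, &pxor(@X[0],@Tx[0]) appends "\tpxor\t%xmm8,%xmm4\n" to
# $code (arguments are reversed into AT&T src,dst order).
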
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd	(@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	foreach (@insns) { eval; }	# remaining instructions

	$Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

	$Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
	$Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}
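
# The SSSE3/AVX paths are software-pipelined: Xuplast_* stores the last
# X[]+K quartet and, unless this was the final block, starts loading
# and byte-swapping the next block; the three Xloop_* calls then retire
# the remaining rounds while that prefetch completes. Xtail_* is the
# drain-only variant used once the last block has been scheduled.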

sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

	$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
	$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	mov	%rsp,$fp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A	# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]	# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	&vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	&vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	&vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	foreach (@insns) { eval; }	# remaining instructions

	$Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

	$Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
	$Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

	$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
	$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___

if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
	mov	%rsp,$fp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	cmp	$num,$frame
	cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu	($inp),%xmm0
	vmovdqu	16($inp),%xmm1
	vmovdqu	32($inp),%xmm2
	vmovdqu	48($inp),%xmm3
	lea	64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vmovdqu	-64($K_XX_XX),$Kx	# K_00_19
	vpshufb	@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
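# Note the block layout: each 256-bit @X register holds the same 16
# bytes of two consecutive blocks -- block N in the low 128-bit lane
# (loaded via vmovdqu above) and block N+1 in the high lane (via
# vinserti128 from $frame) -- so every vector op below computes two
# blocks' schedules at once.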
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	'&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	'&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	'&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
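
# How BMI shortens the bodyx_00_19 critical path: Ch(b,c,d) =
# (b&c)^(~b&d) for the *next* round is assembled off the path, roughly
# (a pseudo-register sketch, not the emitted register allocation):
#
#	t  = andn(b,d)		# ~b & d, three-operand, no extra mov
#	f  = b & c		# via non-destructive rotation of names
#	f ^= t			# f = Ch(b,c,d), ready when needed
#
# while rorx provides non-destructive rotates for a<<<5 and b>>>2.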

sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi		# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame		# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]	# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	mov	@ROTX[0],$A		# A=d
	add	16($ctx),@ROTX[5]
	mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	mov	@ROTX[4],$D		# D=b
	#xchg	@ROTX[5],$F		# F=c, C=f
	mov	@ROTX[4],12($ctx)
	mov	@ROTX[1],$F		# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	mov	@ROTX[5],$E		# E=c
	mov	$a5,$C			# C=f
	#xchg	$F,$E			# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask
	cmp	$num,%rdi		# borrowed $t0
	ja	.Last_avx2

	vmovdqu	-64(%rdi),%xmm0		# low part of @X[-4&7]
	vmovdqu	-48(%rdi),%xmm1
	vmovdqu	-32(%rdi),%xmm2
	vmovdqu	-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);

$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]	# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	mov	@ROTX[0],$A		# A=d
	add	16($ctx),@ROTX[5]
	mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	mov	@ROTX[4],$D		# D=b
	#xchg	@ROTX[5],$F		# F=c, C=f
	mov	@ROTX[4],12($ctx)
	mov	@ROTX[1],$F		# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	mov	@ROTX[5],$E		# E=c
	mov	$a5,$C			# C=f
	#xchg	$F,$E			# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_avx2:
	ret
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
}
1757$code.=<<___;
1758.align 64
1759K_XX_XX:
1760.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1761.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1762.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1763.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1764.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1765.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1766.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1767.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1768.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1769.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
Adam Langleycb5dd632014-06-20 12:00:00 -07001770.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
Adam Langley95c29f32014-06-20 12:00:00 -07001771___
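# A note on the table above: the four SHA-1 round constants are
# floor(2^30 * sqrt(n)) for n = 2, 3, 5, 10 (FIPS 180-4), each
# replicated across lanes so a single aligned load broadcasts it to
# every 32-bit element. They can be re-derived with a one-liner
# (shown for verification only, not used by this module):
#
#   perl -e 'printf "0x%08x\n", int(sqrt($_)*2**30) for (2,3,5,10)'
#   # => 0x5a827999 0x6ed9eba1 0x8f1bbcdc 0xca62c1d6
#
# The trailing .byte row is the 16-byte reversal mask used by the
# SHA Extensions code path's pshufb.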
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
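#
# Each handler below tells RtlVirtualUnwind how to recover the
# caller's frame when an exception hits one of the SHA-1 routines: it
# locates the saved stack pointer, copies the callee-saved GPRs (and,
# where applicable, spilled XMM registers) back into *context, then
# finishes with the shared unwind sequence in .Lcommon_seh_tail. The
# ssse3_handler variant reads a HandlerData[] pair of prologue and
# epilogue labels, so the one handler serves the SSSE3, AVX and AVX2
# functions alike.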
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq
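	# (8 qwords = the four XMM registers, xmm6..xmm9, that the
	# SHAEXT prologue spilled, copied back into the CONTEXT record
	# starting at its Xmm6 slot.)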

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq
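	# (12 qwords = the six spilled XMM registers, xmm6..xmm11,
	# copied back into the CONTEXT record starting at Xmm6.)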

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
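	# Each UNWIND_INFO record below is version 1 with the
	# UNW_FLAG_EHANDLER flag set (hence .byte 9,0,0,0), followed by
	# the .rva of its language-specific handler and, where one is
	# used, the HandlerData[] prologue/epilogue label pair.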
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

sub sha1rnds4 {
    if ($_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$_[0];
    }
}
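# A worked example (illustrative only): "sha1rnds4 \$3,%xmm4,%xmm0"
# matches with $1=3, $2=4 (source, ModR/M r/m field) and $3=0
# (destination, ModR/M reg field), giving ModR/M 0xc0|4|(0<<3)=0xc4
# and emitting ".byte 0x0f,0x3a,0xcc,0xc4,3" -- the SHA1RNDS4
# encoding 0F 3A CC /r ib -- so the output assembles even on
# toolchains that predate the SHA extensions.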

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
	"sha1nexte" => 0xc8,
	"sha1msg1"  => 0xc9,
	"sha1msg2"  => 0xca );

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my @opcode=(0x0f,0x38);
	my $rex=0;
	$rex|=0x04			if ($2>=8);
	$rex|=0x01			if ($1>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
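# Another worked example (illustrative only): "sha1nexte %xmm8,%xmm1"
# matches with $1=8 (source, r/m) and $2=1 (destination, reg), so a
# REX.B prefix 0x41 is prepended and the result is
# ".byte 0x41,0x0f,0x38,0xc8,0xc8" -- SHA1NEXTE's 0F 38 C8 /r
# encoding with ModR/M 0xc0|(8&7)|((1&7)<<3)=0xc8.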

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
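# The final pass above evaluates any remaining `...` expressions and
# rewrites sha1* mnemonics through the encoders, so the emitted
# assembly never depends on assembler support for the SHA extensions.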
close STDOUT;