blob: 30f12381bb77c11e94e981e335aa9e22937b3d81 [file] [log] [blame]
David Benjamind4e37952017-07-25 16:59:58 -04001#! /usr/bin/env perl
2# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
David Benjamin33d10492025-02-03 17:00:03 -05004# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# https://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
David Benjamind4e37952017-07-25 16:59:58 -040015
Adam Langley95c29f32014-06-20 12:00:00 -070016#
17# ====================================================================
18# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
David Benjamind15c0192025-01-08 13:46:55 -050019# project.
Adam Langley95c29f32014-06-20 12:00:00 -070020# ====================================================================
21#
22# sha1_block procedure for x86_64.
23#
24# It was brought to my attention that on EM64T compiler-generated code
25# was far behind 32-bit assembler implementation. This is unlike on
26# Opteron where compiler-generated code was only 15% behind 32-bit
27# assembler, which originally made it hard to motivate the effort.
28# There was suggestion to mechanically translate 32-bit code, but I
29# dismissed it, reasoning that x86_64 offers enough register bank
30# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
31# implementation:-) However! While 64-bit code does perform better
32# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
33# x86_64 does offer larger *addressable* bank, but out-of-order core
34# reaches for even more registers through dynamic aliasing, and EM64T
35# core must have managed to run-time optimize even 32-bit code just as
36# good as 64-bit one. Performance improvement is summarized in the
37# following table:
38#
39# gcc 3.4 32-bit asm cycles/byte
40# Opteron +45% +20% 6.8
41# Xeon P4 +65% +0% 9.9
42# Core2 +60% +10% 7.0
43
44# August 2009.
45#
46# The code was revised to minimize code size and to maximize
47# "distance" between instructions producing input to 'lea'
48# instruction and the 'lea' instruction itself, which is essential
49# for Intel Atom core.
50
51# October 2010.
52#
53# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
54# is to offload message schedule denoted by Wt in NIST specification,
55# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
56# for background and implementation details. The only difference from
57# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
58# to free temporary registers.
59
60# April 2011.
61#
62# Add AVX code path. See sha1-586.pl for further information.
63
64# May 2013.
65#
66# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
67# and loading pair of consecutive blocks to 256-bit %ymm registers)
68# did not provide impressive performance improvement till a crucial
69# hint regarding the number of Xupdate iterations to pre-compute in
70# advance was provided by Ilya Albrekht of Intel Corp.
71
Adam Langleycb5dd632014-06-20 12:00:00 -070072# March 2014.
73#
74# Add support for Intel SHA Extensions.
75
Adam Langley95c29f32014-06-20 12:00:00 -070076######################################################################
77# Current performance is summarized in following table. Numbers are
78# CPU clock cycles spent to process single byte (less is better).
79#
80# x86_64 SSSE3 AVX[2]
Adam Langley5c6ca972014-06-20 12:00:00 -070081# P4 9.05 -
82# Opteron 6.26 -
83# Core2 6.55 6.05/+8% -
84# Westmere 6.73 5.30/+27% -
85# Sandy Bridge 7.70 6.10/+26% 4.99/+54%
86# Ivy Bridge 6.06 4.67/+30% 4.60/+32%
87# Haswell 5.45 4.15/+31% 3.57/+53%
Adam Langley86c06922017-02-09 12:26:22 -080088# Skylake 5.18 4.06/+28% 3.54/+46%
Adam Langley5c6ca972014-06-20 12:00:00 -070089# Bulldozer 9.11 5.95/+53%
David Benjamind4e37952017-07-25 16:59:58 -040090# Ryzen 4.75 3.80/+24% 1.93/+150%(**)
Adam Langley5c6ca972014-06-20 12:00:00 -070091# VIA Nano 9.32 7.15/+30%
Adam Langleycb5dd632014-06-20 12:00:00 -070092# Atom 10.3 9.17/+12%
Adam Langley5c6ca972014-06-20 12:00:00 -070093# Silvermont 13.1(*) 9.37/+40%
David Benjamind4e37952017-07-25 16:59:58 -040094# Knights L 13.2(*) 9.68/+36% 8.30/+59%
Adam Langleyff7fb712017-02-09 12:34:59 -080095# Goldmont 8.13 6.42/+27% 1.70/+380%(**)
Adam Langley5c6ca972014-06-20 12:00:00 -070096#
97# (*) obviously suboptimal result, nothing was done about it,
98# because SSSE3 code is compiled unconditionally;
Adam Langleyff7fb712017-02-09 12:34:59 -080099# (**) SHAEXT result
Adam Langley95c29f32014-06-20 12:00:00 -0700100
# Command-line handling: the first argument is the perlasm "flavour"
# (elf, macosx, mingw64, nasm, ...); the second is the output path.  A
# single argument containing a dot is taken to be the output file only.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention / SEH bookkeeping is needed for the nasm/masm
# and mingw64 flavours, or when the output file looks like MASM input.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator next to this script, falling back
# to the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 2;
$shaext=1;	### set to zero if compiling for 1.0.1

# Pipe everything we print to STDOUT through the translator.  Previously a
# failure to spawn the translator was silently ignored and yielded an
# empty or truncated output file; fail loudly instead.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
120
# Register assignment for the integer-only code path.  These Perl scalars
# hold x86-64 register names that are interpolated into the assembly text.
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

# Scratch registers used inside each round.
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
# Message-schedule words currently in flight; rotated once per round.
@xi=("%edx","%ebp","%r14d");
# The five SHA-1 working variables a..e.
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);
141
# Emit one round of the 0..19 group (K=0x5a827999).  The selection
# function F=(b&c)|(~b&d) is computed as ((c^d)&b)^d in $t0.  Rounds
# 0..14 read message words straight from the input and spill them to the
# stack; rounds 15..19 start the W[t] = rol(W[t-3]^W[t-8]^W[t-14]^W[t-16],1)
# schedule out of the 16-word stack window.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
# Rotate the in-flight message-word registers for the next round.
push(@xi,shift(@xi));
}
183
# Emit one round of the 20..39 or 60..79 groups, which share the parity
# function F=b^c^d (computed incrementally in $t0).  $K picks the round
# constant for whichever group $i falls in.  Round 79 is special: there is
# no next message word to schedule, and the final store is skipped.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
# Rotate the in-flight message-word registers for the next round.
push(@xi,shift(@xi));
}
217
# Emit one round of the 40..59 group (K=0x8f1bbcdc).  The majority
# function Maj(b,c,d) is computed as (c&d) + (b&(c^d)), split across $t0
# and $t1 so the two halves can be interleaved with the schedule update.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
# Rotate the in-flight message-word registers for the next round.
push(@xi,shift(@xi));
}
242
# Emit the integer-only entry point sha1_block_data_order_nohw(ctx,inp,num):
# standard prologue (callee-saved pushes, 16*4-byte aligned stack window
# for the message schedule, original %rsp saved at the top of the window),
# then load the five chaining values from the context.
$code.=<<___;
.text

.globl	sha1_block_data_order_nohw
.type	sha1_block_data_order_nohw,\@function,3
.align	16
sha1_block_data_order_nohw:
.cfi_startproc
	_CET_ENDBR
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.cfi_cfa_expression	%rsp+64,deref,+8
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
# Emit all 80 rounds fully unrolled.  @V is rotated after each round so
# that the (a,b,c,d,e) roles shift without any register moves.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Fold the round results back into the context, advance to the next
# 64-byte block, and restore the callee-saved registers on exit.
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
# Arguments arrive in the SysV registers and are used in place (no
# reassignment needed since the hot loop is pure SIMD).
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
# $ABCD holds working vars a..d packed in one register; $E/$E_ ping-pong
# the e variable between four-round groups.
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.globl	sha1_block_data_order_hw
.type	sha1_block_data_order_hw,\@function,3
.align	32
sha1_block_data_order_hw:
.cfi_startproc
	_CET_ENDBR
___
# On Win64, xmm6-xmm9 are callee-saved and must be preserved.
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec	$num
	lea	0x40($inp),%r8		# next input block
	paddd	@MSG[0],$E
	cmovne	%r8,$inp
	prefetcht0	512($inp)
	movdqa	$ABCD,$ABCD_SAVE	# offload $ABCD
___
# Rounds 0..63: two sha1rnds4 groups per loop iteration, with the message
# schedule (sha1msg1/sha1msg2 + pxor) interleaved.  @MSG is rotated twice
# per iteration to track which register holds which schedule word.
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa	$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor	@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa	$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor	@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
# Rounds 64..79 need no further schedule, so the loads and byte swaps for
# the *next* block are interleaved here instead.
$code.=<<___;
	movdqu	($inp),@MSG[0]
	movdqa	$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD	# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu	0x10($inp),@MSG[1]
	pshufb	$BSWAP,@MSG[0]

	movdqa	$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD	# 68-71
	sha1nexte	@MSG[2],$E
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[1]

	movdqa	$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD	# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[2]

	movdqa	$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD	# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb	$BSWAP,@MSG[3]

	paddd	$ABCD_SAVE,$ABCD
	movdqa	$E,$E_SAVE		# offload $E

	jnz	.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	sha1_block_data_order_hw,.-sha1_block_data_order_hw
___
}}}
440{{{
Adam Langley95c29f32014-06-20 12:00:00 -0700441my $Xi=4;
442my @X=map("%xmm$_",(4..7,0..3));
443my @Tx=map("%xmm$_",(8..10));
444my $Kx="%xmm11";
445my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
446my @T=("%esi","%edi");
447my $j=0;
448my $rx=0;
Adam Langleycb1b3332017-02-09 14:17:39 -0800449my $K_XX_XX="%r14";
450my $fp="%r11";
Adam Langley95c29f32014-06-20 12:00:00 -0700451
452my $_rol=sub { &rol(@_) };
453my $_ror=sub { &ror(@_) };
454
455{ my $sn;
456sub align32() {
457 ++$sn;
458$code.=<<___;
459 jmp .Lalign32_$sn # see "Decoded ICache" in manual
460.align 32
461.Lalign32_$sn:
462___
463}
464}
465
# Emit the SSSE3 entry point prologue: save callee-saved GPRs (and, on
# Win64, xmm6-xmm11), carve out a 64-byte-aligned stack window for the
# X[]+K transfer area, load the context, and pre-schedule the first four
# message quadruples with K_00_19 already added.
$code.=<<___;
.globl	sha1_block_data_order_ssse3
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
.cfi_startproc
	_CET_ENDBR
	mov	%rsp,$fp	# frame pointer
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A	# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]	# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___
538
# Any undefined sub call (e.g. &pxor, &movdqa) lands here and is turned
# into one tab-separated assembly line appended to $code: the sub's name
# becomes the mnemonic, the last argument becomes the first operand
# (AT&T order), and a bare numeric argument gets a '$' immediate prefix.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
545
# Emit the SIMD message-schedule update for W[16..31] (one quadruple),
# interleaved instruction-by-instruction with four deferred scalar round
# bodies pulled from &$body.  The interleaving order is deliberate and
# must not be reordered: it is tuned so SIMD and IALU ops pair on the
# target cores.  This variant differs from _32_79 because W[t-3] is not
# yet a full vector at this point (hence the psrldq/pslldq splicing).
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	&movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	&movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd	(@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
627
# Emit the SIMD message-schedule update for W[32..79]: from round 32 on,
# the recurrence can be rearranged to rotate by 2 on whole vectors.
# As in _16_31, four deferred scalar round bodies are interleaved with
# the SIMD ops; the conditional evals keyed on /_ror/ and /_rol/ peek at
# the upcoming instruction strings to keep rotates paired correctly, so
# the ordering here must not be disturbed.
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
699
# Emit the last schedule transfer of a block (rounds 76..79) plus the
# loop-control tail: compare $inp with $num (the end pointer) and jump to
# .Ldone_ssse3 when all input is consumed; otherwise pre-load and
# byte-swap the next 64-byte block and reset $Xi for the next iteration.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}
734
# Emit one quadruple of the "tail" rounds for the next block's prologue:
# byte-swap a pre-loaded message vector, add the round constant, store
# X[]+K to the stack transfer area and restore X[], interleaved with four
# deferred scalar round bodies (order is significant, as above).
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}
764
# Drain four deferred round bodies with no SIMD work interleaved: used
# for the final rounds of the last block, where there is no message
# schedule left to compute.
sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  # Collect every instruction string up front; each &$body call may bump
  # the round counters as a side effect, so exactly four calls are made.
  my @insns = map { &$body } (1..4);
  my ($a,$b,$c,$d,$e);	# referenced by the eval'ed instruction strings
  eval for @insns;
}
773
# Return the list of deferred instruction strings for one scalar round of
# the 0..19 group (F computed as ((c^d)&b)^d, with the next round's
# partial term prepared in @T).  The strings are eval'ed one at a time by
# the Xupdate_* emitters so SIMD work can be interleaved between them.
# The final string rotates @V/@T and advances $j.
sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
794
# Return the deferred instruction strings for one scalar round of the
# 20..39 / 60..79 groups (parity function b^c^d, built incrementally in
# @T[0] across rounds).  Hands off to body_40_59 at the group boundary.
sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
813
# body_40_59: round snippets for rounds 40..59, F = ((b^c)&(c^d))^c (the
# "Maj" function, algebraically rearranged to reuse the previous round's
# intermediate).  Same snippet-string protocol as body_00_19; the $j
# conditionals handle the 39->40 entry and the 59->60 hand-off back to the
# parity rounds.
sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)	if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
# ---- SSSE3 code path: stitch the 80 rounds together ----
# The Xupdate_ssse3_* helpers (defined earlier in this file) emit the SIMD
# message-schedule computation interleaved with the scalar round bodies
# produced by the body_* generators above.
$code.=<<___;
.align	16
.Loop_ssse3:
___
	# Rounds 0..79.  The trailing Xuplast call compares input pointers and
	# can branch to .Ldone_ssse3 when no further block follows (cf. its AVX
	# analogue Xuplast_avx_80 below).
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

	# Save the generator's round counter and register rotation so the
	# .Ldone_ssse3 tail below can re-emit the same final rounds.
	$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

# Fold the working variables back into the hash state and loop to the next
# block (the heredoc below is emitted assembly and must stay byte-exact).
$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
	# Restore generator state and emit the final-block tail rounds.
	$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
# On Win64, restore the non-volatile XMM registers saved in the prologue.
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
# Epilogue: restore callee-saved GPRs relative to the frame pointer $fp and
# unwind the stack, with matching CFI annotations.
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___
928
# ---- AVX code path (emitted only when the build-time $avx probe, set
# earlier in this file, is non-zero) ----
if ($avx) {
# Reset the generator state that the SSSE3 path advanced.
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

# In this path rotates are emitted as double-precision shifts (shld/shrd).
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.globl	sha1_block_data_order_avx
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
.cfi_startproc
	_CET_ENDBR
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13			# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
# Win64 only: spill non-volatile XMM registers below the saved GPRs.
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
# Load the chaining state, byte-swap the first 64-byte block and stage the
# first three X[]+K groups on the stack for the scalar rounds.
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___
1011
Adam Langley6410e182018-08-07 11:26:15 -07001012sub Xupdate_avx_16_31() # recall that $Xi starts with 4
Adam Langley95c29f32014-06-20 12:00:00 -07001013{ use integer;
1014 my $body = shift;
1015 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
1016 my ($a,$b,$c,$d,$e);
1017
1018 eval(shift(@insns));
1019 eval(shift(@insns));
1020 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
1021 eval(shift(@insns));
1022 eval(shift(@insns));
1023
1024 &vpaddd (@Tx[1],$Kx,@X[-1&7]);
1025 eval(shift(@insns));
1026 eval(shift(@insns));
1027 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
1028 eval(shift(@insns));
1029 eval(shift(@insns));
1030 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
1031 eval(shift(@insns));
1032 eval(shift(@insns));
1033
1034 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
1035 eval(shift(@insns));
1036 eval(shift(@insns));
1037 eval(shift(@insns));
1038 eval(shift(@insns));
1039
1040 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
1041 eval(shift(@insns));
1042 eval(shift(@insns));
1043 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
1044 eval(shift(@insns));
1045 eval(shift(@insns));
1046
1047 &vpsrld (@Tx[0],@X[0],31);
1048 eval(shift(@insns));
1049 eval(shift(@insns));
1050 eval(shift(@insns));
1051 eval(shift(@insns));
1052
1053 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
1054 &vpaddd (@X[0],@X[0],@X[0]);
1055 eval(shift(@insns));
1056 eval(shift(@insns));
1057 eval(shift(@insns));
1058 eval(shift(@insns));
1059
1060 &vpsrld (@Tx[1],@Tx[2],30);
1061 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
1062 eval(shift(@insns));
1063 eval(shift(@insns));
1064 eval(shift(@insns));
1065 eval(shift(@insns));
1066
1067 &vpslld (@Tx[2],@Tx[2],2);
1068 &vpxor (@X[0],@X[0],@Tx[1]);
1069 eval(shift(@insns));
1070 eval(shift(@insns));
1071 eval(shift(@insns));
1072 eval(shift(@insns));
1073
1074 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
1075 eval(shift(@insns));
1076 eval(shift(@insns));
1077 &vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
1078 eval(shift(@insns));
1079 eval(shift(@insns));
1080
1081
1082 foreach (@insns) { eval; } # remaining instructions [if any]
1083
1084 $Xi++; push(@X,shift(@X)); # "rotate" X[]
1085}
1086
# Xupdate_avx_32_79: AVX message-schedule update for rounds 32..79, using
# the simpler recurrence W[i] = (W[i-6]^W[i-16]^W[i-28]^W[i-32])<<<2 (four
# words at a time), interleaved with the queued scalar round snippets.
# The "# body_20_39 / rol / ror" trailing comments mark which snippet of
# the round body is expected at that slot; ordering is deliberate.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	&vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	&vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	&vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	foreach (@insns) { eval; }	# remaining instructions

	$Xi++; push(@X,shift(@X));	# "rotate" X[]
}
1145
# Xuplast_avx_80: emit the last schedule transfer of the block plus the
# loop-control check.  Stores the final X[]+K group, compares $inp against
# $num and branches to $done_avx_label when the input is exhausted;
# otherwise loads and byte-swaps the next 64-byte block and resets $Xi so
# the following Xloop_avx calls pre-process it.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	&vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}
1177
# Xloop_avx: emit four rounds of the current block interleaved with the
# byte-swap and K-addition pre-processing of one 16-byte chunk of the
# *next* block (loaded by Xuplast_avx_80 above).  $Xi indexes which chunk
# is being staged on the stack.
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}
1201
# Xtail_avx: emit four plain rounds with no SIMD work interleaved — the
# final-block tail of the AVX path (mirrors Xtail_ssse3 above).
sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}
1210
# ---- AVX code path: stitch the 80 rounds together (same structure as the
# SSSE3 stitching above, using the AVX helpers) ----
$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

	# Save generator state for the .Ldone_avx tail below.
	$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

# Fold working variables into the hash state and loop to the next block.
$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
	# Restore generator state and emit the final-block tail rounds.
	$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
# Win64 only: restore non-volatile XMM registers saved in the prologue.
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
# Epilogue: restore callee-saved GPRs via $fp with CFI annotations.
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
1305
# ---- AVX2 code path: processes two 64-byte blocks per pass using 256-bit
# ymm registers (low/high 128-bit lanes hold consecutive blocks) and
# BMI2 rorx/andn for the scalar rounds ----
if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

# Six-register rotation: the round bodies rotate @ROTX instead of using a
# fixed A..E assignment; $a5/$t0 are scratch.
my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";			# pointer into the stacked X[]+K frame

$code.=<<___;
.globl	sha1_block_data_order_avx2
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
.cfi_startproc
	_CET_ENDBR
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	vzeroupper
___
# Win64 only: spill non-volatile XMM registers.
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
# Load context, then load block N into the low lanes and block N+1 (or
# block N again, if it is the last) into the high lanes via vinserti128.
$code.=<<___;
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	cmp	$num,$frame
	cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu	($inp),%xmm0
	vmovdqu	16($inp),%xmm1
	vmovdqu	32($inp),%xmm2
	vmovdqu	48($inp),%xmm3
	lea	64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vmovdqu	-64($K_XX_XX),$Kx	# K_00_19
	vpshufb	@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
# Emit the schedule updates for W[16..31] up front (no scalar rounds to
# interleave with yet); results are stacked at 32*$Xi(%rsp).
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
# Loop head: the first round's F-function inputs are pre-computed here.
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___
# bodyx_00_19: BMI2-flavoured round snippets for rounds 0..19,
# F = (b&c)^(~b&d) computed with andn, rotates with rorx.  Snippet-string
# protocol as with the body_* generators above, but rotating the
# six-element @ROTX set; X[i]+K comes from the 256-byte stacked frame.
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	'&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
1450
# bodyx_20_39: BMI2-flavoured round snippets for rounds 20..39 and 60..79,
# F = b^c^d.  The $j<79 guards suppress next-round setup after the last
# round; falls through to bodyx_40_59 after round 39 via the $rx check.
sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	'&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
1471
# bodyx_40_59: BMI2-flavoured round snippets for rounds 40..59,
# F = ((b^c)&(c^d))^c (Maj, rearranged).  The $j guards handle the 39->40
# entry and the 59->60 hand-off back to the parity rounds.
sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	'&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
1496
David Benjamind4e37952017-07-25 16:59:58 -04001497sub Xupdate_avx2_16_31() # recall that $Xi starts with 4
Adam Langley95c29f32014-06-20 12:00:00 -07001498{ use integer;
1499 my $body = shift;
1500 my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions
1501 my ($a,$b,$c,$d,$e);
1502
1503 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
1504 eval(shift(@insns));
1505 eval(shift(@insns));
1506 eval(shift(@insns));
1507 eval(shift(@insns));
1508
1509 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
1510 eval(shift(@insns));
1511 eval(shift(@insns));
1512 eval(shift(@insns));
1513
1514 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
1515 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
1516 eval(shift(@insns));
1517 eval(shift(@insns));
1518
1519 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
1520 eval(shift(@insns));
1521 eval(shift(@insns));
1522 eval(shift(@insns));
1523 eval(shift(@insns));
1524
1525 &vpsrld (@Tx[0],@X[0],31);
1526 &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
1527 eval(shift(@insns));
1528 eval(shift(@insns));
1529 eval(shift(@insns));
1530
1531 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
1532 &vpaddd (@X[0],@X[0],@X[0]);
1533 eval(shift(@insns));
1534 eval(shift(@insns));
1535
1536 &vpsrld (@Tx[1],@Tx[2],30);
1537 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
1538 eval(shift(@insns));
1539 eval(shift(@insns));
1540
1541 &vpslld (@Tx[2],@Tx[2],2);
1542 &vpxor (@X[0],@X[0],@Tx[1]);
1543 eval(shift(@insns));
1544 eval(shift(@insns));
1545
1546 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
1547 eval(shift(@insns));
1548 eval(shift(@insns));
1549 eval(shift(@insns));
1550
1551 &vpaddd (@Tx[1],@X[0],$Kx);
1552 eval(shift(@insns));
1553 eval(shift(@insns));
1554 eval(shift(@insns));
1555 &vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
1556
1557 foreach (@insns) { eval; } # remaining instructions [if any]
1558
1559 $Xi++;
1560 push(@X,shift(@X)); # "rotate" X[]
1561}
1562
# Xupdate_avx2_32_79: ymm version of the rounds-32..79 schedule update
# (W[i] = (W[i-6]^W[i-16]^W[i-28]^W[i-32])<<<2), interleaved with the
# queued scalar round snippets.  Interleaving order is deliberate.
sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}
1615
# Xloop_avx2: emit four scalar rounds with no SIMD interleaving; used both
# as plain filler rounds in .Loop_avx2 and as the carrier for the manually
# interleaved SIMD calls in the .Last_avx2 sequence below.
sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}
1624
	# ---- AVX2 main loop body: rounds 0..79 for the two stacked blocks.
	# Rounds 0..15 for the *next* pair overlap with the schedule updates
	# emitted here (W[16..31] of the current pair was produced up front).
	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

# Update the context.  Note the register shuffle documented in the emitted
# comment: the rotated @ROTX set is re-mapped onto $A/$F/$C/$D/$E for the
# next pass.
$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi			# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame			# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	mov	@ROTX[4],$D			# D=b
	#xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	mov	@ROTX[5],$E			# E=c
	mov	$a5,$C				# C=f
	#xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

# Reset the ymm rotation for the tail pass over the remaining block(s).
$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
	cmp	$num,%rdi			# borrowed $t0
	ja	.Last_avx2

	vmovdqu	-64(%rdi),%xmm0			# low part of @X[-4&7]
	vmovdqu	-48(%rdi),%xmm1
	vmovdqu	-32(%rdi),%xmm2
	vmovdqu	-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	# Tail pass: re-run the 80 rounds, manually interleaving the next
	# block's byte-swap/K-addition and early schedule updates between
	# Xloop_avx2 round groups.
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);

$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	mov	@ROTX[4],$D			# D=b
	#xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	mov	@ROTX[5],$E			# E=c
	mov	$a5,$C				# C=f
	#xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
# Win64 only: restore non-volatile XMM registers.
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
# Epilogue: restore callee-saved GPRs via $fp with CFI annotations.
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}	# end if ($avx>1)
}	# end if ($avx)
# Read-only constant pool for the SIMD paths: each of the four SHA-1 round
# constants is replicated across eight 32-bit lanes (two .long rows), a
# byte-shuffle mask converts little-endian input words to big-endian, and a
# final descending 0xf..0x0 byte-index table provides a 16-byte reversal.
1799$code.=<<___;
Theo Buehlerebd43ef2023-02-17 10:05:47 +01001800.section .rodata
Adam Langley95c29f32014-06-20 12:00:00 -07001801.align 64
1802K_XX_XX:
1803.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1804.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1805.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1806.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1807.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1808.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1809.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1810.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1811.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1812.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
Adam Langleycb5dd632014-06-20 12:00:00 -07001813.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
Adam Langley95c29f32014-06-20 12:00:00 -07001814___
# Close three nested generator scopes opened earlier in the file (outside
# this excerpt).
1815}}}
# Unconditional trailer: embed the CRYPTOGAMS attribution string, then
# switch the assembler back to .text after the .rodata section above.
1816$code.=<<___;
1817.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1818.align 64
Theo Buehlerebd43ef2023-02-17 10:05:47 +01001819.text
Adam Langley95c29f32014-06-20 12:00:00 -07001820___
1821
1822# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1823# CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception-handling support.  These handlers let the OS
# unwind through the assembly routines above if an exception is raised
# while one of them is executing.  The four Perl variables name the Win64
# argument registers for the handler signature shown above.
1824if ($win64) {
1825$rec="%rcx";
1826$frame="%rdx";
1827$context="%r8";
1828$disp="%r9";
1829
1830$code.=<<___;
1831.extern __imp_RtlVirtualUnwind
1832.type se_handler,\@abi-omnipotent
1833.align 16
# se_handler: unwinder for the scalar routine (wired to the _nohw entry in
# the .xdata table below).  Saves all volatile state, then decides where
# the faulting RIP lies relative to the routine's prologue/epilogue.
1834se_handler:
1835 push %rsi
1836 push %rdi
1837 push %rbx
1838 push %rbp
1839 push %r12
1840 push %r13
1841 push %r14
1842 push %r15
1843 pushfq
1844 sub \$64,%rsp
1845
1846 mov 120($context),%rax # pull context->Rax
1847 mov 248($context),%rbx # pull context->Rip
1848
# Before the prologue completed, nothing was saved yet: nothing to undo.
1849 lea .Lprologue(%rip),%r10
1850 cmp %r10,%rbx # context->Rip<.Lprologue
1851 jb .Lcommon_seh_tail
1852
1853 mov 152($context),%rax # pull context->Rsp
1854
# Past the epilogue, registers are already restored: nothing to undo.
1855 lea .Lepilogue(%rip),%r10
1856 cmp %r10,%rbx # context->Rip>=.Lepilogue
1857 jae .Lcommon_seh_tail
1858
1859 mov `16*4`(%rax),%rax # pull saved stack pointer
Adam Langley95c29f32014-06-20 12:00:00 -07001860
# Mid-body: reload the five callee-saved GPRs from the routine's frame and
# write them back into the CONTEXT record at their documented offsets.
1861 mov -8(%rax),%rbx
1862 mov -16(%rax),%rbp
1863 mov -24(%rax),%r12
1864 mov -32(%rax),%r13
Adam Langley5c6ca972014-06-20 12:00:00 -07001865 mov -40(%rax),%r14
Adam Langley95c29f32014-06-20 12:00:00 -07001866 mov %rbx,144($context) # restore context->Rbx
1867 mov %rbp,160($context) # restore context->Rbp
1868 mov %r12,216($context) # restore context->R12
1869 mov %r13,224($context) # restore context->R13
Adam Langley5c6ca972014-06-20 12:00:00 -07001870 mov %r14,232($context) # restore context->R14
Adam Langley95c29f32014-06-20 12:00:00 -07001871
1872 jmp .Lcommon_seh_tail
1873.size se_handler,.-se_handler
Adam Langley3ffd70e2014-06-20 12:00:00 -07001874___
Adam Langley95c29f32014-06-20 12:00:00 -07001875
# shaext_handler: unwinder for the SHA-extension routine (emitted only when
# $shaext is enabled).  Unlike se_handler it has no GPRs to recover — it
# only copies the four spilled XMM registers back into the CONTEXT record.
$code.=<<___ if ($shaext);
Adam Langleycb5dd632014-06-20 12:00:00 -07001877.type shaext_handler,\@abi-omnipotent
1878.align 16
1879shaext_handler:
1880 push %rsi
1881 push %rdi
1882 push %rbx
1883 push %rbp
1884 push %r12
1885 push %r13
1886 push %r14
1887 push %r15
1888 pushfq
1889 sub \$64,%rsp
1890
1891 mov 120($context),%rax # pull context->Rax
1892 mov 248($context),%rbx # pull context->Rip
1893
# Outside the prologue..epilogue window nothing is live on the frame.
1894 lea .Lprologue_shaext(%rip),%r10
1895 cmp %r10,%rbx # context->Rip<.Lprologue
1896 jb .Lcommon_seh_tail
1897
1898 lea .Lepilogue_shaext(%rip),%r10
1899 cmp %r10,%rbx # context->Rip>=.Lepilogue
1900 jae .Lcommon_seh_tail
1901
# Copy 4 spilled XMM registers (8 quadwords) from the frame back into
# context->Xmm6 and onward.
1902 lea -8-4*16(%rax),%rsi
1903 lea 512($context),%rdi # &context.Xmm6
1904 mov \$8,%ecx
1905 .long 0xa548f3fc # cld; rep movsq
1906
1907 jmp .Lcommon_seh_tail
1908.size shaext_handler,.-shaext_handler
Adam Langley3ffd70e2014-06-20 12:00:00 -07001909___
Adam Langleycb5dd632014-06-20 12:00:00 -07001910
# ssse3_handler: shared unwinder for the SSSE3/AVX/AVX2 routines.  The
# prologue/epilogue bounds are not hard-coded; they come from the two RVAs
# in HandlerData[] (see the .xdata entries below), so one handler serves
# all three SIMD implementations.
$code.=<<___;
Adam Langley95c29f32014-06-20 12:00:00 -07001912.type ssse3_handler,\@abi-omnipotent
1913.align 16
1914ssse3_handler:
1915 push %rsi
1916 push %rdi
1917 push %rbx
1918 push %rbp
1919 push %r12
1920 push %r13
1921 push %r14
1922 push %r15
1923 pushfq
1924 sub \$64,%rsp
1925
1926 mov 120($context),%rax # pull context->Rax
1927 mov 248($context),%rbx # pull context->Rip
1928
1929 mov 8($disp),%rsi # disp->ImageBase
1930 mov 56($disp),%r11 # disp->HandlerData
1931
# HandlerData[0]/[1] are image-relative prologue/epilogue labels; rebase
# them against ImageBase before comparing with the faulting RIP.
1932 mov 0(%r11),%r10d # HandlerData[0]
1933 lea (%rsi,%r10),%r10 # prologue label
1934 cmp %r10,%rbx # context->Rip<prologue label
1935 jb .Lcommon_seh_tail
1936
# The SIMD routines keep their frame pointer in R11 at run time.
Adam Langleycb1b3332017-02-09 14:17:39 -08001937 mov 208($context),%rax # pull context->R11
Adam Langley95c29f32014-06-20 12:00:00 -07001938
1939 mov 4(%r11),%r10d # HandlerData[1]
1940 lea (%rsi,%r10),%r10 # epilogue label
1941 cmp %r10,%rbx # context->Rip>=epilogue label
1942 jae .Lcommon_seh_tail
1943
# Copy 6 spilled XMM registers (12 quadwords) back into context->Xmm6..
Adam Langley95c29f32014-06-20 12:00:00 -07001944 lea -40-6*16(%rax),%rsi
1945 lea 512($context),%rdi # &context.Xmm6
1946 mov \$12,%ecx
1947 .long 0xa548f3fc # cld; rep movsq
1948
# Recover the five callee-saved GPRs and write them into the CONTEXT.
1949 mov -8(%rax),%rbx
1950 mov -16(%rax),%rbp
1951 mov -24(%rax),%r12
1952 mov -32(%rax),%r13
1953 mov -40(%rax),%r14
1954 mov %rbx,144($context) # restore context->Rbx
1955 mov %rbp,160($context) # restore context->Rbp
Adam Langley6410e182018-08-07 11:26:15 -07001956 mov %r12,216($context) # restore context->R12
1957 mov %r13,224($context) # restore context->R13
1958 mov %r14,232($context) # restore context->R14
Adam Langley95c29f32014-06-20 12:00:00 -07001959
# Common tail shared by all three handlers: fix up Rsp/Rsi/Rdi in the
# CONTEXT, then delegate the remaining unwind to RtlVirtualUnwind and
# return ExceptionContinueSearch.
1960.Lcommon_seh_tail:
1961 mov 8(%rax),%rdi
1962 mov 16(%rax),%rsi
1963 mov %rax,152($context) # restore context->Rsp
1964 mov %rsi,168($context) # restore context->Rsi
1965 mov %rdi,176($context) # restore context->Rdi
1966
1967 mov 40($disp),%rdi # disp->ContextRecord
1968 mov $context,%rsi # context
1969 mov \$154,%ecx # sizeof(CONTEXT)
1970 .long 0xa548f3fc # cld; rep movsq
1971
1972 mov $disp,%rsi
1973 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1974 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1975 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1976 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1977 mov 40(%rsi),%r10 # disp->ContextRecord
1978 lea 56(%rsi),%r11 # &disp->HandlerData
1979 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1980 mov %r10,32(%rsp) # arg5
1981 mov %r11,40(%rsp) # arg6
1982 mov %r12,48(%rsp) # arg7
1983 mov %rcx,56(%rsp) # arg8, (NULL)
1984 call *__imp_RtlVirtualUnwind(%rip)
1985
1986 mov \$1,%eax # ExceptionContinueSearch
1987 add \$64,%rsp
1988 popfq
1989 pop %r15
1990 pop %r14
1991 pop %r13
1992 pop %r12
1993 pop %rbp
1994 pop %rbx
1995 pop %rdi
1996 pop %rsi
1997 ret
1998.size ssse3_handler,.-ssse3_handler
1999
# .pdata: one RUNTIME_FUNCTION entry (begin, end, unwind-info RVAs) per
# emitted SHA-1 routine, so the Win64 unwinder can find the handlers above.
# Entries for the optional implementations are gated on the same flags
# ($shaext, $avx) that gated the code itself.
2000.section .pdata
2001.align 4
Brian Smith10c24cb2023-11-27 14:42:36 -08002002 .rva .LSEH_begin_sha1_block_data_order_nohw
2003 .rva .LSEH_end_sha1_block_data_order_nohw
2004 .rva .LSEH_info_sha1_block_data_order_nohw
Adam Langley006779a2014-06-20 12:00:00 -07002005___
2006$code.=<<___ if ($shaext);
Brian Smith10c24cb2023-11-27 14:42:36 -08002007 .rva .LSEH_begin_sha1_block_data_order_hw
2008 .rva .LSEH_end_sha1_block_data_order_hw
2009 .rva .LSEH_info_sha1_block_data_order_hw
Adam Langley006779a2014-06-20 12:00:00 -07002010___
2011$code.=<<___;
Adam Langley95c29f32014-06-20 12:00:00 -07002012 .rva .LSEH_begin_sha1_block_data_order_ssse3
2013 .rva .LSEH_end_sha1_block_data_order_ssse3
2014 .rva .LSEH_info_sha1_block_data_order_ssse3
2015___
2016$code.=<<___ if ($avx);
2017 .rva .LSEH_begin_sha1_block_data_order_avx
2018 .rva .LSEH_end_sha1_block_data_order_avx
2019 .rva .LSEH_info_sha1_block_data_order_avx
2020___
2021$code.=<<___ if ($avx>1);
2022 .rva .LSEH_begin_sha1_block_data_order_avx2
2023 .rva .LSEH_end_sha1_block_data_order_avx2
2024 .rva .LSEH_info_sha1_block_data_order_avx2
2025___
# .xdata: hand-built UNWIND_INFO records.  ".byte 9,0,0,0" encodes
# version 1 with UNW_FLAG_EHANDLER and no unwind codes (per the Win64
# UNWIND_INFO layout), followed by the handler RVA; the SIMD entries also
# carry the prologue/epilogue label pair read as HandlerData[] by
# ssse3_handler.
2026$code.=<<___;
2027.section .xdata
2028.align 8
Brian Smith10c24cb2023-11-27 14:42:36 -08002029.LSEH_info_sha1_block_data_order_nohw:
Adam Langley95c29f32014-06-20 12:00:00 -07002030 .byte 9,0,0,0
2031 .rva se_handler
Adam Langley3ffd70e2014-06-20 12:00:00 -07002032___
2033$code.=<<___ if ($shaext);
Brian Smith10c24cb2023-11-27 14:42:36 -08002034.LSEH_info_sha1_block_data_order_hw:
Adam Langleycb5dd632014-06-20 12:00:00 -07002035 .byte 9,0,0,0
2036 .rva shaext_handler
Adam Langley3ffd70e2014-06-20 12:00:00 -07002037___
2038$code.=<<___;
Adam Langley95c29f32014-06-20 12:00:00 -07002039.LSEH_info_sha1_block_data_order_ssse3:
2040 .byte 9,0,0,0
2041 .rva ssse3_handler
2042 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2043___
2044$code.=<<___ if ($avx);
2045.LSEH_info_sha1_block_data_order_avx:
2046 .byte 9,0,0,0
2047 .rva ssse3_handler
2048 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
2049___
2050$code.=<<___ if ($avx>1);
2051.LSEH_info_sha1_block_data_order_avx2:
2052 .byte 9,0,0,0
2053 .rva ssse3_handler
2054 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2055___
# End of the if ($win64) { ... } block opened above.
2056}
2057
2058####################################################################
2059
# Final emission pass: split the accumulated $code string into lines,
# expand each `...` placeholder by eval'ing its contents as Perl (used for
# compile-time arithmetic such as `16*4`), and print the result.
Adam Langleycb5dd632014-06-20 12:00:00 -07002060foreach (split("\n",$code)) {
2061 s/\`([^\`]*)\`/eval $1/geo;
2062
Adam Langleycb5dd632014-06-20 12:00:00 -07002063 print $_,"\n";
2064}
# Close STDOUT explicitly so buffered-write failures are detected and
# reported instead of silently truncating the generated assembly.
David Benjamin549e4e72021-08-05 14:32:53 -04002065close STDOUT or die "error closing STDOUT: $!";