// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

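// The entry points below roughly correspond to the following C prototypes
// (an approximation for the reader; the authoritative declarations live in
// BoringSSL's modes internal header, not in this generated file):
//
//   void gcm_init_clmul(u128 Htable[16], const uint64_t H[2]);
//   void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]);
//   void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16],
//                        const uint8_t *inp, size_t len);
//   void gcm_init_avx(u128 Htable[16], const uint64_t H[2]);
//   void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
//   void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16],
//                      const uint8_t *inp, size_t len);
//
// Under the System V AMD64 ABI these arguments arrive in %rdi, %rsi, %rdx
// and %rcx, which is how the code below refers to them.
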
#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text
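// gcm_init_clmul(Htable, H): build the GHASH key schedule with PCLMULQDQ.
// %rsi points at the raw hash key H; the code first reduces H<<1 modulo the
// polynomial constant .L0x1c2_polynomial, then stores H, H^2, a combined
// (hi^lo) Karatsuba helper, H^3, H^4 and their helper at 0..80(%rdi).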
.globl gcm_init_clmul
.hidden gcm_init_clmul
.type gcm_init_clmul,@function
.align 16
gcm_init_clmul:
.cfi_startproc

_CET_ENDBR
.L_init_clmul:
 movdqu (%rsi),%xmm2
 pshufd $78,%xmm2,%xmm2


 pshufd $255,%xmm2,%xmm4
 movdqa %xmm2,%xmm3
 psllq $1,%xmm2
 pxor %xmm5,%xmm5
 psrlq $63,%xmm3
 pcmpgtd %xmm4,%xmm5
 pslldq $8,%xmm3
 por %xmm3,%xmm2


 pand .L0x1c2_polynomial(%rip),%xmm5
 pxor %xmm5,%xmm2


 pshufd $78,%xmm2,%xmm6
 movdqa %xmm2,%xmm0
 pxor %xmm2,%xmm6
 movdqa %xmm0,%xmm1
 pshufd $78,%xmm0,%xmm3
 pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
 pxor %xmm0,%xmm3
 pxor %xmm1,%xmm3

 movdqa %xmm3,%xmm4
 psrldq $8,%xmm3
 pslldq $8,%xmm4
 pxor %xmm3,%xmm1
 pxor %xmm4,%xmm0

 movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
 psllq $5,%xmm0
 pxor %xmm0,%xmm3
 psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
 movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
 psrldq $8,%xmm3
 pxor %xmm4,%xmm0
 pxor %xmm3,%xmm1


 movdqa %xmm0,%xmm4
 psrlq $1,%xmm0
 pxor %xmm4,%xmm1
 pxor %xmm0,%xmm4
 psrlq $5,%xmm0
 pxor %xmm4,%xmm0
 psrlq $1,%xmm0
 pxor %xmm1,%xmm0
 pshufd $78,%xmm2,%xmm3
 pshufd $78,%xmm0,%xmm4
 pxor %xmm2,%xmm3
 movdqu %xmm2,0(%rdi)
 pxor %xmm0,%xmm4
 movdqu %xmm0,16(%rdi)
.byte 102,15,58,15,227,8
 movdqu %xmm4,32(%rdi)
 movdqa %xmm0,%xmm1
 pshufd $78,%xmm0,%xmm3
 pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
 pxor %xmm0,%xmm3
 pxor %xmm1,%xmm3

 movdqa %xmm3,%xmm4
 psrldq $8,%xmm3
 pslldq $8,%xmm4
 pxor %xmm3,%xmm1
 pxor %xmm4,%xmm0

 movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
 psllq $5,%xmm0
 pxor %xmm0,%xmm3
 psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
 movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
 psrldq $8,%xmm3
 pxor %xmm4,%xmm0
 pxor %xmm3,%xmm1


 movdqa %xmm0,%xmm4
 psrlq $1,%xmm0
 pxor %xmm4,%xmm1
 pxor %xmm0,%xmm4
 psrlq $5,%xmm0
 pxor %xmm4,%xmm0
 psrlq $1,%xmm0
 pxor %xmm1,%xmm0
 movdqa %xmm0,%xmm5
 movdqa %xmm0,%xmm1
 pshufd $78,%xmm0,%xmm3
 pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
 pxor %xmm0,%xmm3
 pxor %xmm1,%xmm3

 movdqa %xmm3,%xmm4
 psrldq $8,%xmm3
 pslldq $8,%xmm4
 pxor %xmm3,%xmm1
 pxor %xmm4,%xmm0

 movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
 psllq $5,%xmm0
 pxor %xmm0,%xmm3
 psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
 movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
 psrldq $8,%xmm3
 pxor %xmm4,%xmm0
 pxor %xmm3,%xmm1


 movdqa %xmm0,%xmm4
 psrlq $1,%xmm0
 pxor %xmm4,%xmm1
 pxor %xmm0,%xmm4
 psrlq $5,%xmm0
 pxor %xmm4,%xmm0
 psrlq $1,%xmm0
 pxor %xmm1,%xmm0
 pshufd $78,%xmm5,%xmm3
 pshufd $78,%xmm0,%xmm4
 pxor %xmm5,%xmm3
 movdqu %xmm5,48(%rdi)
 pxor %xmm0,%xmm4
 movdqu %xmm0,64(%rdi)
.byte 102,15,58,15,227,8
 movdqu %xmm4,80(%rdi)
 ret
.cfi_endproc

.size gcm_init_clmul,.-gcm_init_clmul
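// gcm_gmult_clmul(Xi, Htable): one GHASH step with no data input. Loads Xi
// from (%rdi), byte-swaps it via .Lbswap_mask, multiplies it by H from the
// Htable at (%rsi) in GF(2^128), reduces, and stores the result back to
// (%rdi).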
.globl gcm_gmult_clmul
.hidden gcm_gmult_clmul
.type gcm_gmult_clmul,@function
.align 16
gcm_gmult_clmul:
.cfi_startproc
_CET_ENDBR
.L_gmult_clmul:
 movdqu (%rdi),%xmm0
 movdqa .Lbswap_mask(%rip),%xmm5
 movdqu (%rsi),%xmm2
 movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
 movdqa %xmm0,%xmm1
 pshufd $78,%xmm0,%xmm3
 pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
 pxor %xmm0,%xmm3
 pxor %xmm1,%xmm3

 movdqa %xmm3,%xmm4
 psrldq $8,%xmm3
 pslldq $8,%xmm4
 pxor %xmm3,%xmm1
 pxor %xmm4,%xmm0

 movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
 psllq $5,%xmm0
 pxor %xmm0,%xmm3
 psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
 movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
 psrldq $8,%xmm3
 pxor %xmm4,%xmm0
 pxor %xmm3,%xmm1


 movdqa %xmm0,%xmm4
 psrlq $1,%xmm0
 pxor %xmm4,%xmm1
 pxor %xmm0,%xmm4
 psrlq $5,%xmm0
 pxor %xmm4,%xmm0
 psrlq $1,%xmm0
 pxor %xmm1,%xmm0
.byte 102,15,56,0,197
 movdqu %xmm0,(%rdi)
 ret
.cfi_endproc
.size gcm_gmult_clmul,.-gcm_gmult_clmul
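// gcm_ghash_clmul(Xi, Htable, inp, len): hash len (%rcx) bytes from %rdx into
// Xi at %rdi using the precomputed powers of H at %rsi. Four blocks are
// folded per iteration of .Lmod4_loop when enough input remains; shorter
// inputs fall back to the two-block .Lmod_loop and the single-block
// .Lodd_tail path.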
.globl gcm_ghash_clmul
.hidden gcm_ghash_clmul
.type gcm_ghash_clmul,@function
.align 32
gcm_ghash_clmul:
.cfi_startproc

_CET_ENDBR
.L_ghash_clmul:
 movdqa .Lbswap_mask(%rip),%xmm10

 movdqu (%rdi),%xmm0
 movdqu (%rsi),%xmm2
 movdqu 32(%rsi),%xmm7
.byte 102,65,15,56,0,194

 subq $0x10,%rcx
 jz .Lodd_tail

 movdqu 16(%rsi),%xmm6
 cmpq $0x30,%rcx
 jb .Lskip4x

 subq $0x30,%rcx
 movq $0xA040608020C0E000,%rax
 movdqu 48(%rsi),%xmm14
 movdqu 64(%rsi),%xmm15




 movdqu 48(%rdx),%xmm3
 movdqu 32(%rdx),%xmm11
.byte 102,65,15,56,0,218
.byte 102,69,15,56,0,218
 movdqa %xmm3,%xmm5
 pshufd $78,%xmm3,%xmm4
 pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0

 movdqa %xmm11,%xmm13
 pshufd $78,%xmm11,%xmm12
 pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
.byte 102,68,15,58,68,231,16
 xorps %xmm11,%xmm3
 xorps %xmm13,%xmm5
 movups 80(%rsi),%xmm7
 xorps %xmm12,%xmm4

 movdqu 16(%rdx),%xmm11
 movdqu 0(%rdx),%xmm8
.byte 102,69,15,56,0,218
.byte 102,69,15,56,0,194
 movdqa %xmm11,%xmm13
 pshufd $78,%xmm11,%xmm12
 pxor %xmm8,%xmm0
 pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
 movdqa %xmm0,%xmm1
 pshufd $78,%xmm0,%xmm8
 pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
.byte 102,68,15,58,68,231,0
 xorps %xmm11,%xmm3
 xorps %xmm13,%xmm5

 leaq 64(%rdx),%rdx
 subq $0x40,%rcx
 jc .Ltail4x

 jmp .Lmod4_loop
.align 32
.Lmod4_loop:
.byte 102,65,15,58,68,199,0
 xorps %xmm12,%xmm4
 movdqu 48(%rdx),%xmm11
.byte 102,69,15,56,0,218
.byte 102,65,15,58,68,207,17
 xorps %xmm3,%xmm0
 movdqu 32(%rdx),%xmm3
 movdqa %xmm11,%xmm13
.byte 102,68,15,58,68,199,16
 pshufd $78,%xmm11,%xmm12
 xorps %xmm5,%xmm1
 pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
 movups 32(%rsi),%xmm7
 xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
 pshufd $78,%xmm3,%xmm4

 pxor %xmm0,%xmm8
 movdqa %xmm3,%xmm5
 pxor %xmm1,%xmm8
 pxor %xmm3,%xmm4
 movdqa %xmm8,%xmm9
.byte 102,68,15,58,68,234,17
 pslldq $8,%xmm8
 psrldq $8,%xmm9
 pxor %xmm8,%xmm0
 movdqa .L7_mask(%rip),%xmm8
 pxor %xmm9,%xmm1
.byte 102,76,15,110,200

 pand %xmm0,%xmm8
.byte 102,69,15,56,0,200
 pxor %xmm0,%xmm9
.byte 102,68,15,58,68,231,0
 psllq $57,%xmm9
 movdqa %xmm9,%xmm8
 pslldq $8,%xmm9
.byte 102,15,58,68,222,0
 psrldq $8,%xmm8
 pxor %xmm9,%xmm0
 pxor %xmm8,%xmm1
 movdqu 0(%rdx),%xmm8

 movdqa %xmm0,%xmm9
 psrlq $1,%xmm0
.byte 102,15,58,68,238,17
 xorps %xmm11,%xmm3
 movdqu 16(%rdx),%xmm11
.byte 102,69,15,56,0,218
.byte 102,15,58,68,231,16
 xorps %xmm13,%xmm5
 movups 80(%rsi),%xmm7
.byte 102,69,15,56,0,194
 pxor %xmm9,%xmm1
 pxor %xmm0,%xmm9
 psrlq $5,%xmm0

 movdqa %xmm11,%xmm13
 pxor %xmm12,%xmm4
 pshufd $78,%xmm11,%xmm12
 pxor %xmm9,%xmm0
 pxor %xmm8,%xmm1
 pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
 psrlq $1,%xmm0
 pxor %xmm1,%xmm0
 movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
 xorps %xmm11,%xmm3
 pshufd $78,%xmm0,%xmm8
 pxor %xmm0,%xmm8

.byte 102,68,15,58,68,231,0
 xorps %xmm13,%xmm5

 leaq 64(%rdx),%rdx
 subq $0x40,%rcx
 jnc .Lmod4_loop

.Ltail4x:
.byte 102,65,15,58,68,199,0
.byte 102,65,15,58,68,207,17
.byte 102,68,15,58,68,199,16
 xorps %xmm12,%xmm4
 xorps %xmm3,%xmm0
 xorps %xmm5,%xmm1
 pxor %xmm0,%xmm1
 pxor %xmm4,%xmm8

 pxor %xmm1,%xmm8
 pxor %xmm0,%xmm1

 movdqa %xmm8,%xmm9
 psrldq $8,%xmm8
 pslldq $8,%xmm9
 pxor %xmm8,%xmm1
 pxor %xmm9,%xmm0

 movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
 psllq $5,%xmm0
 pxor %xmm0,%xmm3
 psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
 movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
 psrldq $8,%xmm3
 pxor %xmm4,%xmm0
 pxor %xmm3,%xmm1


 movdqa %xmm0,%xmm4
 psrlq $1,%xmm0
 pxor %xmm4,%xmm1
 pxor %xmm0,%xmm4
 psrlq $5,%xmm0
 pxor %xmm4,%xmm0
 psrlq $1,%xmm0
 pxor %xmm1,%xmm0
 addq $0x40,%rcx
 jz .Ldone
 movdqu 32(%rsi),%xmm7
 subq $0x10,%rcx
 jz .Lodd_tail
.Lskip4x:




 movdqu (%rdx),%xmm8
 movdqu 16(%rdx),%xmm3
.byte 102,69,15,56,0,194
.byte 102,65,15,56,0,218
 pxor %xmm8,%xmm0

 movdqa %xmm3,%xmm5
 pshufd $78,%xmm3,%xmm4
 pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0

 leaq 32(%rdx),%rdx
 nop
 subq $0x20,%rcx
 jbe .Leven_tail
 nop
 jmp .Lmod_loop

.align 32
.Lmod_loop:
 movdqa %xmm0,%xmm1
 movdqa %xmm4,%xmm8
 pshufd $78,%xmm0,%xmm4
 pxor %xmm0,%xmm4

.byte 102,15,58,68,198,0
.byte 102,15,58,68,206,17
.byte 102,15,58,68,231,16

 pxor %xmm3,%xmm0
 pxor %xmm5,%xmm1
 movdqu (%rdx),%xmm9
 pxor %xmm0,%xmm8
.byte 102,69,15,56,0,202
 movdqu 16(%rdx),%xmm3

 pxor %xmm1,%xmm8
 pxor %xmm9,%xmm1
 pxor %xmm8,%xmm4
.byte 102,65,15,56,0,218
 movdqa %xmm4,%xmm8
 psrldq $8,%xmm8
 pslldq $8,%xmm4
 pxor %xmm8,%xmm1
 pxor %xmm4,%xmm0

 movdqa %xmm3,%xmm5

 movdqa %xmm0,%xmm9
 movdqa %xmm0,%xmm8
 psllq $5,%xmm0
 pxor %xmm0,%xmm8
.byte 102,15,58,68,218,0
 psllq $1,%xmm0
 pxor %xmm8,%xmm0
 psllq $57,%xmm0
 movdqa %xmm0,%xmm8
 pslldq $8,%xmm0
 psrldq $8,%xmm8
 pxor %xmm9,%xmm0
 pshufd $78,%xmm5,%xmm4
 pxor %xmm8,%xmm1
 pxor %xmm5,%xmm4

 movdqa %xmm0,%xmm9
 psrlq $1,%xmm0
.byte 102,15,58,68,234,17
 pxor %xmm9,%xmm1
 pxor %xmm0,%xmm9
 psrlq $5,%xmm0
 pxor %xmm9,%xmm0
 leaq 32(%rdx),%rdx
 psrlq $1,%xmm0
.byte 102,15,58,68,231,0
 pxor %xmm1,%xmm0

 subq $0x20,%rcx
 ja .Lmod_loop

.Leven_tail:
 movdqa %xmm0,%xmm1
 movdqa %xmm4,%xmm8
 pshufd $78,%xmm0,%xmm4
 pxor %xmm0,%xmm4

.byte 102,15,58,68,198,0
.byte 102,15,58,68,206,17
.byte 102,15,58,68,231,16

 pxor %xmm3,%xmm0
 pxor %xmm5,%xmm1
 pxor %xmm0,%xmm8
 pxor %xmm1,%xmm8
 pxor %xmm8,%xmm4
 movdqa %xmm4,%xmm8
 psrldq $8,%xmm8
 pslldq $8,%xmm4
 pxor %xmm8,%xmm1
 pxor %xmm4,%xmm0

 movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
 psllq $5,%xmm0
 pxor %xmm0,%xmm3
 psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
 movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
 psrldq $8,%xmm3
 pxor %xmm4,%xmm0
 pxor %xmm3,%xmm1


 movdqa %xmm0,%xmm4
 psrlq $1,%xmm0
 pxor %xmm4,%xmm1
 pxor %xmm0,%xmm4
 psrlq $5,%xmm0
 pxor %xmm4,%xmm0
 psrlq $1,%xmm0
 pxor %xmm1,%xmm0
 testq %rcx,%rcx
 jnz .Ldone

.Lodd_tail:
 movdqu (%rdx),%xmm8
.byte 102,69,15,56,0,194
 pxor %xmm8,%xmm0
 movdqa %xmm0,%xmm1
 pshufd $78,%xmm0,%xmm3
 pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,223,0
 pxor %xmm0,%xmm3
 pxor %xmm1,%xmm3

 movdqa %xmm3,%xmm4
 psrldq $8,%xmm3
 pslldq $8,%xmm4
 pxor %xmm3,%xmm1
 pxor %xmm4,%xmm0

 movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
 psllq $5,%xmm0
 pxor %xmm0,%xmm3
 psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
 movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
 psrldq $8,%xmm3
 pxor %xmm4,%xmm0
 pxor %xmm3,%xmm1


 movdqa %xmm0,%xmm4
 psrlq $1,%xmm0
 pxor %xmm4,%xmm1
 pxor %xmm0,%xmm4
 psrlq $5,%xmm0
 pxor %xmm4,%xmm0
 psrlq $1,%xmm0
 pxor %xmm1,%xmm0
.Ldone:
.byte 102,65,15,56,0,194
 movdqu %xmm0,(%rdi)
 ret
.cfi_endproc

.size gcm_ghash_clmul,.-gcm_ghash_clmul
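// gcm_init_avx(Htable, H): AVX counterpart of gcm_init_clmul. After the same
// initial reduction of H, four passes of .Linit_start_avx/.Linit_loop_avx
// each store two further powers of H plus a combined helper (48 bytes per
// pass), so the table covers H through H^8 for the eight-block AVX loop.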
.globl gcm_init_avx
.hidden gcm_init_avx
.type gcm_init_avx,@function
.align 32
gcm_init_avx:
.cfi_startproc

_CET_ENDBR
 vzeroupper

 vmovdqu (%rsi),%xmm2
 vpshufd $78,%xmm2,%xmm2


 vpshufd $255,%xmm2,%xmm4
 vpsrlq $63,%xmm2,%xmm3
 vpsllq $1,%xmm2,%xmm2
 vpxor %xmm5,%xmm5,%xmm5
 vpcmpgtd %xmm4,%xmm5,%xmm5
 vpslldq $8,%xmm3,%xmm3
 vpor %xmm3,%xmm2,%xmm2


 vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
 vpxor %xmm5,%xmm2,%xmm2

 vpunpckhqdq %xmm2,%xmm2,%xmm6
 vmovdqa %xmm2,%xmm0
 vpxor %xmm2,%xmm6,%xmm6
 movq $4,%r10
 jmp .Linit_start_avx
.align 32
.Linit_loop_avx:
 vpalignr $8,%xmm3,%xmm4,%xmm5
 vmovdqu %xmm5,-16(%rdi)
 vpunpckhqdq %xmm0,%xmm0,%xmm3
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
 vpxor %xmm0,%xmm1,%xmm4
 vpxor %xmm4,%xmm3,%xmm3

 vpslldq $8,%xmm3,%xmm4
 vpsrldq $8,%xmm3,%xmm3
 vpxor %xmm4,%xmm0,%xmm0
 vpxor %xmm3,%xmm1,%xmm1
 vpsllq $57,%xmm0,%xmm3
 vpsllq $62,%xmm0,%xmm4
 vpxor %xmm3,%xmm4,%xmm4
 vpsllq $63,%xmm0,%xmm3
 vpxor %xmm3,%xmm4,%xmm4
 vpslldq $8,%xmm4,%xmm3
 vpsrldq $8,%xmm4,%xmm4
 vpxor %xmm3,%xmm0,%xmm0
 vpxor %xmm4,%xmm1,%xmm1

 vpsrlq $1,%xmm0,%xmm4
 vpxor %xmm0,%xmm1,%xmm1
 vpxor %xmm4,%xmm0,%xmm0
 vpsrlq $5,%xmm4,%xmm4
 vpxor %xmm4,%xmm0,%xmm0
 vpsrlq $1,%xmm0,%xmm0
 vpxor %xmm1,%xmm0,%xmm0
.Linit_start_avx:
 vmovdqa %xmm0,%xmm5
 vpunpckhqdq %xmm0,%xmm0,%xmm3
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
 vpxor %xmm0,%xmm1,%xmm4
 vpxor %xmm4,%xmm3,%xmm3

 vpslldq $8,%xmm3,%xmm4
 vpsrldq $8,%xmm3,%xmm3
 vpxor %xmm4,%xmm0,%xmm0
 vpxor %xmm3,%xmm1,%xmm1
 vpsllq $57,%xmm0,%xmm3
 vpsllq $62,%xmm0,%xmm4
 vpxor %xmm3,%xmm4,%xmm4
 vpsllq $63,%xmm0,%xmm3
 vpxor %xmm3,%xmm4,%xmm4
 vpslldq $8,%xmm4,%xmm3
 vpsrldq $8,%xmm4,%xmm4
 vpxor %xmm3,%xmm0,%xmm0
 vpxor %xmm4,%xmm1,%xmm1

 vpsrlq $1,%xmm0,%xmm4
 vpxor %xmm0,%xmm1,%xmm1
 vpxor %xmm4,%xmm0,%xmm0
 vpsrlq $5,%xmm4,%xmm4
 vpxor %xmm4,%xmm0,%xmm0
 vpsrlq $1,%xmm0,%xmm0
 vpxor %xmm1,%xmm0,%xmm0
 vpshufd $78,%xmm5,%xmm3
 vpshufd $78,%xmm0,%xmm4
 vpxor %xmm5,%xmm3,%xmm3
 vmovdqu %xmm5,0(%rdi)
 vpxor %xmm0,%xmm4,%xmm4
 vmovdqu %xmm0,16(%rdi)
 leaq 48(%rdi),%rdi
 subq $1,%r10
 jnz .Linit_loop_avx

 vpalignr $8,%xmm4,%xmm3,%xmm5
 vmovdqu %xmm5,-16(%rdi)

 vzeroupper
 ret

.cfi_endproc
.size gcm_init_avx,.-gcm_init_avx
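// gcm_gmult_avx(Xi, Htable): single-block multiply; the first table entries
// are laid out the same way in both schedules, so this simply jumps to the
// PCLMULQDQ implementation above.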
.globl gcm_gmult_avx
.hidden gcm_gmult_avx
.type gcm_gmult_avx,@function
.align 32
gcm_gmult_avx:
.cfi_startproc
_CET_ENDBR
 jmp .L_gmult_clmul
.cfi_endproc
.size gcm_gmult_avx,.-gcm_gmult_avx
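// gcm_ghash_avx(Xi, Htable, inp, len): AVX version of gcm_ghash_clmul. With
// at least 128 bytes of input, .Loop8x_avx folds eight blocks per iteration
// using H..H^8 from the table at %rsi (biased by 64 via the leaq below);
// .Lshort_avx consumes the remainder 16 bytes at a time and .Ltail_avx does
// the final reduction.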
.globl gcm_ghash_avx
.hidden gcm_ghash_avx
.type gcm_ghash_avx,@function
.align 32
gcm_ghash_avx:
.cfi_startproc

_CET_ENDBR
 vzeroupper

 vmovdqu (%rdi),%xmm10
 leaq .L0x1c2_polynomial(%rip),%r10
 leaq 64(%rsi),%rsi
 vmovdqu .Lbswap_mask(%rip),%xmm13
 vpshufb %xmm13,%xmm10,%xmm10
 cmpq $0x80,%rcx
 jb .Lshort_avx
 subq $0x80,%rcx

 vmovdqu 112(%rdx),%xmm14
 vmovdqu 0-64(%rsi),%xmm6
 vpshufb %xmm13,%xmm14,%xmm14
 vmovdqu 32-64(%rsi),%xmm7

 vpunpckhqdq %xmm14,%xmm14,%xmm9
 vmovdqu 96(%rdx),%xmm15
 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
 vpxor %xmm14,%xmm9,%xmm9
 vpshufb %xmm13,%xmm15,%xmm15
 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
 vmovdqu 16-64(%rsi),%xmm6
 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vmovdqu 80(%rdx),%xmm14
 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
 vpxor %xmm15,%xmm8,%xmm8

 vpshufb %xmm13,%xmm14,%xmm14
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
 vpunpckhqdq %xmm14,%xmm14,%xmm9
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
 vmovdqu 48-64(%rsi),%xmm6
 vpxor %xmm14,%xmm9,%xmm9
 vmovdqu 64(%rdx),%xmm15
 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
 vmovdqu 80-64(%rsi),%xmm7

 vpshufb %xmm13,%xmm15,%xmm15
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
 vpxor %xmm1,%xmm4,%xmm4
 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
 vmovdqu 64-64(%rsi),%xmm6
 vpxor %xmm2,%xmm5,%xmm5
 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
 vpxor %xmm15,%xmm8,%xmm8

 vmovdqu 48(%rdx),%xmm14
 vpxor %xmm3,%xmm0,%xmm0
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
 vpxor %xmm4,%xmm1,%xmm1
 vpshufb %xmm13,%xmm14,%xmm14
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
 vmovdqu 96-64(%rsi),%xmm6
 vpxor %xmm5,%xmm2,%xmm2
 vpunpckhqdq %xmm14,%xmm14,%xmm9
 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
 vmovdqu 128-64(%rsi),%xmm7
 vpxor %xmm14,%xmm9,%xmm9

 vmovdqu 32(%rdx),%xmm15
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
 vpxor %xmm1,%xmm4,%xmm4
 vpshufb %xmm13,%xmm15,%xmm15
 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
 vmovdqu 112-64(%rsi),%xmm6
 vpxor %xmm2,%xmm5,%xmm5
 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
 vpxor %xmm15,%xmm8,%xmm8

 vmovdqu 16(%rdx),%xmm14
 vpxor %xmm3,%xmm0,%xmm0
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
 vpxor %xmm4,%xmm1,%xmm1
 vpshufb %xmm13,%xmm14,%xmm14
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
 vmovdqu 144-64(%rsi),%xmm6
 vpxor %xmm5,%xmm2,%xmm2
 vpunpckhqdq %xmm14,%xmm14,%xmm9
 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
 vmovdqu 176-64(%rsi),%xmm7
 vpxor %xmm14,%xmm9,%xmm9

 vmovdqu (%rdx),%xmm15
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
 vpxor %xmm1,%xmm4,%xmm4
 vpshufb %xmm13,%xmm15,%xmm15
 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
 vmovdqu 160-64(%rsi),%xmm6
 vpxor %xmm2,%xmm5,%xmm5
 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2

 leaq 128(%rdx),%rdx
 cmpq $0x80,%rcx
 jb .Ltail_avx

 vpxor %xmm10,%xmm15,%xmm15
 subq $0x80,%rcx
 jmp .Loop8x_avx

.align 32
.Loop8x_avx:
 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vmovdqu 112(%rdx),%xmm14
 vpxor %xmm0,%xmm3,%xmm3
 vpxor %xmm15,%xmm8,%xmm8
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
 vpshufb %xmm13,%xmm14,%xmm14
 vpxor %xmm1,%xmm4,%xmm4
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
 vmovdqu 0-64(%rsi),%xmm6
 vpunpckhqdq %xmm14,%xmm14,%xmm9
 vpxor %xmm2,%xmm5,%xmm5
 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
 vmovdqu 32-64(%rsi),%xmm7
 vpxor %xmm14,%xmm9,%xmm9

 vmovdqu 96(%rdx),%xmm15
 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
 vpxor %xmm3,%xmm10,%xmm10
 vpshufb %xmm13,%xmm15,%xmm15
 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
 vxorps %xmm4,%xmm11,%xmm11
 vmovdqu 16-64(%rsi),%xmm6
 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
 vpxor %xmm5,%xmm12,%xmm12
 vxorps %xmm15,%xmm8,%xmm8

 vmovdqu 80(%rdx),%xmm14
 vpxor %xmm10,%xmm12,%xmm12
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
 vpxor %xmm11,%xmm12,%xmm12
 vpslldq $8,%xmm12,%xmm9
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
 vpsrldq $8,%xmm12,%xmm12
 vpxor %xmm9,%xmm10,%xmm10
 vmovdqu 48-64(%rsi),%xmm6
 vpshufb %xmm13,%xmm14,%xmm14
 vxorps %xmm12,%xmm11,%xmm11
 vpxor %xmm1,%xmm4,%xmm4
 vpunpckhqdq %xmm14,%xmm14,%xmm9
 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
 vmovdqu 80-64(%rsi),%xmm7
 vpxor %xmm14,%xmm9,%xmm9
 vpxor %xmm2,%xmm5,%xmm5

 vmovdqu 64(%rdx),%xmm15
 vpalignr $8,%xmm10,%xmm10,%xmm12
 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
 vpshufb %xmm13,%xmm15,%xmm15
 vpxor %xmm3,%xmm0,%xmm0
 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
 vmovdqu 64-64(%rsi),%xmm6
 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vpxor %xmm4,%xmm1,%xmm1
 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
 vxorps %xmm15,%xmm8,%xmm8
 vpxor %xmm5,%xmm2,%xmm2

 vmovdqu 48(%rdx),%xmm14
 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
 vpshufb %xmm13,%xmm14,%xmm14
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
 vmovdqu 96-64(%rsi),%xmm6
 vpunpckhqdq %xmm14,%xmm14,%xmm9
 vpxor %xmm1,%xmm4,%xmm4
 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
 vmovdqu 128-64(%rsi),%xmm7
 vpxor %xmm14,%xmm9,%xmm9
 vpxor %xmm2,%xmm5,%xmm5

 vmovdqu 32(%rdx),%xmm15
 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
 vpshufb %xmm13,%xmm15,%xmm15
 vpxor %xmm3,%xmm0,%xmm0
 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
 vmovdqu 112-64(%rsi),%xmm6
 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vpxor %xmm4,%xmm1,%xmm1
 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
 vpxor %xmm15,%xmm8,%xmm8
 vpxor %xmm5,%xmm2,%xmm2
 vxorps %xmm12,%xmm10,%xmm10

 vmovdqu 16(%rdx),%xmm14
 vpalignr $8,%xmm10,%xmm10,%xmm12
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
 vpshufb %xmm13,%xmm14,%xmm14
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
 vmovdqu 144-64(%rsi),%xmm6
 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
 vxorps %xmm11,%xmm12,%xmm12
 vpunpckhqdq %xmm14,%xmm14,%xmm9
 vpxor %xmm1,%xmm4,%xmm4
 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
 vmovdqu 176-64(%rsi),%xmm7
 vpxor %xmm14,%xmm9,%xmm9
 vpxor %xmm2,%xmm5,%xmm5

 vmovdqu (%rdx),%xmm15
 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
 vpshufb %xmm13,%xmm15,%xmm15
 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
 vmovdqu 160-64(%rsi),%xmm6
 vpxor %xmm12,%xmm15,%xmm15
 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
 vpxor %xmm10,%xmm15,%xmm15

 leaq 128(%rdx),%rdx
 subq $0x80,%rcx
 jnc .Loop8x_avx

 addq $0x80,%rcx
 jmp .Ltail_no_xor_avx

.align 32
.Lshort_avx:
 vmovdqu -16(%rdx,%rcx,1),%xmm14
 leaq (%rdx,%rcx,1),%rdx
 vmovdqu 0-64(%rsi),%xmm6
 vmovdqu 32-64(%rsi),%xmm7
 vpshufb %xmm13,%xmm14,%xmm15

 vmovdqa %xmm0,%xmm3
 vmovdqa %xmm1,%xmm4
 vmovdqa %xmm2,%xmm5
 subq $0x10,%rcx
 jz .Ltail_avx

 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
 vpxor %xmm15,%xmm8,%xmm8
 vmovdqu -32(%rdx),%xmm14
 vpxor %xmm1,%xmm4,%xmm4
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
 vmovdqu 16-64(%rsi),%xmm6
 vpshufb %xmm13,%xmm14,%xmm15
 vpxor %xmm2,%xmm5,%xmm5
 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
 vpsrldq $8,%xmm7,%xmm7
 subq $0x10,%rcx
 jz .Ltail_avx

 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
 vpxor %xmm15,%xmm8,%xmm8
 vmovdqu -48(%rdx),%xmm14
 vpxor %xmm1,%xmm4,%xmm4
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
 vmovdqu 48-64(%rsi),%xmm6
 vpshufb %xmm13,%xmm14,%xmm15
 vpxor %xmm2,%xmm5,%xmm5
 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
 vmovdqu 80-64(%rsi),%xmm7
 subq $0x10,%rcx
 jz .Ltail_avx

 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
 vpxor %xmm15,%xmm8,%xmm8
 vmovdqu -64(%rdx),%xmm14
 vpxor %xmm1,%xmm4,%xmm4
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
 vmovdqu 64-64(%rsi),%xmm6
 vpshufb %xmm13,%xmm14,%xmm15
 vpxor %xmm2,%xmm5,%xmm5
 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
 vpsrldq $8,%xmm7,%xmm7
 subq $0x10,%rcx
 jz .Ltail_avx

 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
 vpxor %xmm15,%xmm8,%xmm8
 vmovdqu -80(%rdx),%xmm14
 vpxor %xmm1,%xmm4,%xmm4
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
 vmovdqu 96-64(%rsi),%xmm6
 vpshufb %xmm13,%xmm14,%xmm15
 vpxor %xmm2,%xmm5,%xmm5
 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
 vmovdqu 128-64(%rsi),%xmm7
 subq $0x10,%rcx
 jz .Ltail_avx

 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
 vpxor %xmm15,%xmm8,%xmm8
 vmovdqu -96(%rdx),%xmm14
 vpxor %xmm1,%xmm4,%xmm4
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
 vmovdqu 112-64(%rsi),%xmm6
 vpshufb %xmm13,%xmm14,%xmm15
 vpxor %xmm2,%xmm5,%xmm5
 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
 vpsrldq $8,%xmm7,%xmm7
 subq $0x10,%rcx
 jz .Ltail_avx

 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
 vpxor %xmm15,%xmm8,%xmm8
 vmovdqu -112(%rdx),%xmm14
 vpxor %xmm1,%xmm4,%xmm4
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
 vmovdqu 144-64(%rsi),%xmm6
 vpshufb %xmm13,%xmm14,%xmm15
 vpxor %xmm2,%xmm5,%xmm5
 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
 vmovq 184-64(%rsi),%xmm7
 subq $0x10,%rcx
 jmp .Ltail_avx

.align 32
.Ltail_avx:
 vpxor %xmm10,%xmm15,%xmm15
.Ltail_no_xor_avx:
 vpunpckhqdq %xmm15,%xmm15,%xmm8
 vpxor %xmm0,%xmm3,%xmm3
 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
 vpxor %xmm15,%xmm8,%xmm8
 vpxor %xmm1,%xmm4,%xmm4
 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
 vpxor %xmm2,%xmm5,%xmm5
 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2

 vmovdqu (%r10),%xmm12

 vpxor %xmm0,%xmm3,%xmm10
 vpxor %xmm1,%xmm4,%xmm11
 vpxor %xmm2,%xmm5,%xmm5

 vpxor %xmm10,%xmm5,%xmm5
 vpxor %xmm11,%xmm5,%xmm5
 vpslldq $8,%xmm5,%xmm9
 vpsrldq $8,%xmm5,%xmm5
 vpxor %xmm9,%xmm10,%xmm10
 vpxor %xmm5,%xmm11,%xmm11

 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
 vpalignr $8,%xmm10,%xmm10,%xmm10
 vpxor %xmm9,%xmm10,%xmm10

 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
 vpalignr $8,%xmm10,%xmm10,%xmm10
 vpxor %xmm11,%xmm10,%xmm10
 vpxor %xmm9,%xmm10,%xmm10

 cmpq $0,%rcx
 jne .Lshort_avx

 vpshufb %xmm13,%xmm10,%xmm10
 vmovdqu %xmm10,(%rdi)
 vzeroupper
 ret
.cfi_endproc

.size gcm_ghash_avx,.-gcm_ghash_avx
.section .rodata
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
.long 7,0,7,0
.align 64

.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
.text
#endif