blob: 48d3cfc84fc6fbfeea8edabfc3d1f9981fa2be3d [file] [log] [blame]
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
7.text
8
.globl	_aes_hw_encrypt
.private_extern	_aes_hw_encrypt

// _aes_hw_encrypt: encrypt a single 16-byte block with AES-NI.
//   In:  %rdi -> 16-byte input block (read with an unaligned load)
//        %rsi -> 16-byte output block (written with an unaligned store)
//        %rdx -> round-key array; the 32-bit loop count is read from
//                byte offset 240 of the same structure
// Clobbers %eax, %rdx, %xmm0-%xmm2 and flags. %xmm0-%xmm2 are zeroed
// before returning.
.p2align	4
_aes_hw_encrypt:

_CET_ENDBR
#ifdef BORINGSSL_DISPATCH_TEST

	movb	$1,_BORINGSSL_function_hit+1(%rip)
#endif
	movups	(%rdi),%xmm2		// xmm2 = input block
	movl	240(%rdx),%eax		// eax  = loop count
	movups	(%rdx),%xmm0		// xmm0 = round key 0
	movups	16(%rdx),%xmm1		// xmm1 = round key 1
	leaq	32(%rdx),%rdx		// rdx -> round key 2
	xorps	%xmm0,%xmm2		// initial key whitening

	// One aesenc per iteration. decl sets ZF; the movups/leaq that follow
	// leave flags untouched, so jnz still tests the decrement.
L$oop_enc1_1:
	aesenc	%xmm1,%xmm2
	decl	%eax
	movups	(%rdx),%xmm1		// fetch the next round key
	leaq	16(%rdx),%rdx
	jnz	L$oop_enc1_1

	aesenclast	%xmm1,%xmm2	// final round
	pxor	%xmm0,%xmm0		// wipe key material
	pxor	%xmm1,%xmm1
	movups	%xmm2,(%rsi)		// store the result
	pxor	%xmm2,%xmm2		// wipe the data register
	ret
38
39
40
.globl	_aes_hw_decrypt
.private_extern	_aes_hw_decrypt

// _aes_hw_decrypt: decrypt a single 16-byte block with AES-NI.
//   In:  %rdi -> 16-byte input block (read with an unaligned load)
//        %rsi -> 16-byte output block (written with an unaligned store)
//        %rdx -> round-key array; the 32-bit loop count is read from
//                byte offset 240 of the same structure
// Clobbers %eax, %rdx, %xmm0-%xmm2 and flags. %xmm0-%xmm2 are zeroed
// before returning.
.p2align	4
_aes_hw_decrypt:

_CET_ENDBR
	movups	(%rdi),%xmm2		// xmm2 = input block
	movl	240(%rdx),%eax		// eax  = loop count
	movups	(%rdx),%xmm0		// xmm0 = round key 0
	movups	16(%rdx),%xmm1		// xmm1 = round key 1
	leaq	32(%rdx),%rdx		// rdx -> round key 2
	xorps	%xmm0,%xmm2		// initial key whitening

	// One aesdec per iteration. decl sets ZF; the movups/leaq that follow
	// leave flags untouched, so jnz still tests the decrement.
L$oop_dec1_2:
	aesdec	%xmm1,%xmm2
	decl	%eax
	movups	(%rdx),%xmm1		// fetch the next round key
	leaq	16(%rdx),%rdx
	jnz	L$oop_dec1_2

	aesdeclast	%xmm1,%xmm2	// final round
	pxor	%xmm0,%xmm0		// wipe key material
	pxor	%xmm1,%xmm1
	movups	%xmm2,(%rsi)		// store the result
	pxor	%xmm2,%xmm2		// wipe the data register
	ret
66
67
68
// _aesni_encrypt2: file-local helper that encrypts two blocks in parallel.
//   In:  %rcx -> round-key array, %eax = loop count,
//        %xmm2, %xmm3 = input blocks
//   Out: %xmm2, %xmm3 = results
// Clobbers %rax, %rcx, %xmm0, %xmm1 and flags.
.p2align	4
_aesni_encrypt2:

	movups	(%rcx),%xmm0		// round key 0
	shll	$4,%eax			// loop count -> byte offset
	movups	16(%rcx),%xmm1		// round key 1
	xorps	%xmm0,%xmm2		// whitening
	xorps	%xmm0,%xmm3
	movups	32(%rcx),%xmm0		// round key 2
	leaq	32(%rcx,%rax),%rcx	// rcx -> past the keys walked by the loop
	negq	%rax
	addq	$16,%rax		// rax = negative offset, counts up to 0

	// Two rounds per iteration, alternating %xmm1 / %xmm0 as the key.
	// addq sets ZF; the aesenc/movups after it do not touch flags.
L$enc_loop2:
	aesenc	%xmm1,%xmm2
	aesenc	%xmm1,%xmm3
	movups	(%rcx,%rax),%xmm1
	addq	$32,%rax
	aesenc	%xmm0,%xmm2
	aesenc	%xmm0,%xmm3
	movups	-16(%rcx,%rax),%xmm0
	jnz	L$enc_loop2

	aesenc	%xmm1,%xmm2
	aesenc	%xmm1,%xmm3
	aesenclast	%xmm0,%xmm2
	aesenclast	%xmm0,%xmm3
	ret
97
98
99
// _aesni_decrypt2: file-local helper that decrypts two blocks in parallel.
//   In:  %rcx -> round-key array, %eax = loop count,
//        %xmm2, %xmm3 = input blocks
//   Out: %xmm2, %xmm3 = results
// Clobbers %rax, %rcx, %xmm0, %xmm1 and flags.
.p2align	4
_aesni_decrypt2:

	movups	(%rcx),%xmm0		// round key 0
	shll	$4,%eax			// loop count -> byte offset
	movups	16(%rcx),%xmm1		// round key 1
	xorps	%xmm0,%xmm2		// whitening
	xorps	%xmm0,%xmm3
	movups	32(%rcx),%xmm0		// round key 2
	leaq	32(%rcx,%rax),%rcx	// rcx -> past the keys walked by the loop
	negq	%rax
	addq	$16,%rax		// rax = negative offset, counts up to 0

	// Two rounds per iteration, alternating %xmm1 / %xmm0 as the key.
	// addq sets ZF; the aesdec/movups after it do not touch flags.
L$dec_loop2:
	aesdec	%xmm1,%xmm2
	aesdec	%xmm1,%xmm3
	movups	(%rcx,%rax),%xmm1
	addq	$32,%rax
	aesdec	%xmm0,%xmm2
	aesdec	%xmm0,%xmm3
	movups	-16(%rcx,%rax),%xmm0
	jnz	L$dec_loop2

	aesdec	%xmm1,%xmm2
	aesdec	%xmm1,%xmm3
	aesdeclast	%xmm0,%xmm2
	aesdeclast	%xmm0,%xmm3
	ret
128
129
130
// _aesni_encrypt3: file-local helper that encrypts three blocks in parallel.
//   In:  %rcx -> round-key array, %eax = loop count,
//        %xmm2-%xmm4 = input blocks
//   Out: %xmm2-%xmm4 = results
// Clobbers %rax, %rcx, %xmm0, %xmm1 and flags.
.p2align	4
_aesni_encrypt3:

	movups	(%rcx),%xmm0		// round key 0
	shll	$4,%eax			// loop count -> byte offset
	movups	16(%rcx),%xmm1		// round key 1
	xorps	%xmm0,%xmm2		// whitening
	xorps	%xmm0,%xmm3
	xorps	%xmm0,%xmm4
	movups	32(%rcx),%xmm0		// round key 2
	leaq	32(%rcx,%rax),%rcx	// rcx -> past the keys walked by the loop
	negq	%rax
	addq	$16,%rax		// rax = negative offset, counts up to 0

	// Two rounds per iteration, alternating %xmm1 / %xmm0 as the key.
	// addq sets ZF; the aesenc/movups after it do not touch flags.
L$enc_loop3:
	aesenc	%xmm1,%xmm2
	aesenc	%xmm1,%xmm3
	aesenc	%xmm1,%xmm4
	movups	(%rcx,%rax),%xmm1
	addq	$32,%rax
	aesenc	%xmm0,%xmm2
	aesenc	%xmm0,%xmm3
	aesenc	%xmm0,%xmm4
	movups	-16(%rcx,%rax),%xmm0
	jnz	L$enc_loop3

	aesenc	%xmm1,%xmm2
	aesenc	%xmm1,%xmm3
	aesenc	%xmm1,%xmm4
	aesenclast	%xmm0,%xmm2
	aesenclast	%xmm0,%xmm3
	aesenclast	%xmm0,%xmm4
	ret
164
165
166
// _aesni_decrypt3: file-local helper that decrypts three blocks in parallel.
//   In:  %rcx -> round-key array, %eax = loop count,
//        %xmm2-%xmm4 = input blocks
//   Out: %xmm2-%xmm4 = results
// Clobbers %rax, %rcx, %xmm0, %xmm1 and flags.
.p2align	4
_aesni_decrypt3:

	movups	(%rcx),%xmm0		// round key 0
	shll	$4,%eax			// loop count -> byte offset
	movups	16(%rcx),%xmm1		// round key 1
	xorps	%xmm0,%xmm2		// whitening
	xorps	%xmm0,%xmm3
	xorps	%xmm0,%xmm4
	movups	32(%rcx),%xmm0		// round key 2
	leaq	32(%rcx,%rax),%rcx	// rcx -> past the keys walked by the loop
	negq	%rax
	addq	$16,%rax		// rax = negative offset, counts up to 0

	// Two rounds per iteration, alternating %xmm1 / %xmm0 as the key.
	// addq sets ZF; the aesdec/movups after it do not touch flags.
L$dec_loop3:
	aesdec	%xmm1,%xmm2
	aesdec	%xmm1,%xmm3
	aesdec	%xmm1,%xmm4
	movups	(%rcx,%rax),%xmm1
	addq	$32,%rax
	aesdec	%xmm0,%xmm2
	aesdec	%xmm0,%xmm3
	aesdec	%xmm0,%xmm4
	movups	-16(%rcx,%rax),%xmm0
	jnz	L$dec_loop3

	aesdec	%xmm1,%xmm2
	aesdec	%xmm1,%xmm3
	aesdec	%xmm1,%xmm4
	aesdeclast	%xmm0,%xmm2
	aesdeclast	%xmm0,%xmm3
	aesdeclast	%xmm0,%xmm4
	ret
200
201
202
// _aesni_encrypt4: file-local helper that encrypts four blocks in parallel.
//   In:  %rcx -> round-key array, %eax = loop count,
//        %xmm2-%xmm5 = input blocks
//   Out: %xmm2-%xmm5 = results
// Clobbers %rax, %rcx, %xmm0, %xmm1 and flags.
.p2align	4
_aesni_encrypt4:

	movups	(%rcx),%xmm0		// round key 0
	shll	$4,%eax			// loop count -> byte offset
	movups	16(%rcx),%xmm1		// round key 1
	xorps	%xmm0,%xmm2		// whitening
	xorps	%xmm0,%xmm3
	xorps	%xmm0,%xmm4
	xorps	%xmm0,%xmm5
	movups	32(%rcx),%xmm0		// round key 2
	leaq	32(%rcx,%rax),%rcx	// rcx -> past the keys walked by the loop
	negq	%rax
.byte	0x0f,0x1f,0x00			// 3-byte nop (nopl (%rax)), kept as raw bytes
	addq	$16,%rax		// rax = negative offset, counts up to 0

	// Two rounds per iteration, alternating %xmm1 / %xmm0 as the key.
	// addq sets ZF; the aesenc/movups after it do not touch flags.
L$enc_loop4:
	aesenc	%xmm1,%xmm2
	aesenc	%xmm1,%xmm3
	aesenc	%xmm1,%xmm4
	aesenc	%xmm1,%xmm5
	movups	(%rcx,%rax),%xmm1
	addq	$32,%rax
	aesenc	%xmm0,%xmm2
	aesenc	%xmm0,%xmm3
	aesenc	%xmm0,%xmm4
	aesenc	%xmm0,%xmm5
	movups	-16(%rcx,%rax),%xmm0
	jnz	L$enc_loop4

	aesenc	%xmm1,%xmm2
	aesenc	%xmm1,%xmm3
	aesenc	%xmm1,%xmm4
	aesenc	%xmm1,%xmm5
	aesenclast	%xmm0,%xmm2
	aesenclast	%xmm0,%xmm3
	aesenclast	%xmm0,%xmm4
	aesenclast	%xmm0,%xmm5
	ret
242
243
244
// _aesni_decrypt4: file-local helper that decrypts four blocks in parallel.
//   In:  %rcx -> round-key array, %eax = loop count,
//        %xmm2-%xmm5 = input blocks
//   Out: %xmm2-%xmm5 = results
// Clobbers %rax, %rcx, %xmm0, %xmm1 and flags.
.p2align	4
_aesni_decrypt4:

	movups	(%rcx),%xmm0		// round key 0
	shll	$4,%eax			// loop count -> byte offset
	movups	16(%rcx),%xmm1		// round key 1
	xorps	%xmm0,%xmm2		// whitening
	xorps	%xmm0,%xmm3
	xorps	%xmm0,%xmm4
	xorps	%xmm0,%xmm5
	movups	32(%rcx),%xmm0		// round key 2
	leaq	32(%rcx,%rax),%rcx	// rcx -> past the keys walked by the loop
	negq	%rax
.byte	0x0f,0x1f,0x00			// 3-byte nop (nopl (%rax)), kept as raw bytes
	addq	$16,%rax		// rax = negative offset, counts up to 0

	// Two rounds per iteration, alternating %xmm1 / %xmm0 as the key.
	// addq sets ZF; the aesdec/movups after it do not touch flags.
L$dec_loop4:
	aesdec	%xmm1,%xmm2
	aesdec	%xmm1,%xmm3
	aesdec	%xmm1,%xmm4
	aesdec	%xmm1,%xmm5
	movups	(%rcx,%rax),%xmm1
	addq	$32,%rax
	aesdec	%xmm0,%xmm2
	aesdec	%xmm0,%xmm3
	aesdec	%xmm0,%xmm4
	aesdec	%xmm0,%xmm5
	movups	-16(%rcx,%rax),%xmm0
	jnz	L$dec_loop4

	aesdec	%xmm1,%xmm2
	aesdec	%xmm1,%xmm3
	aesdec	%xmm1,%xmm4
	aesdec	%xmm1,%xmm5
	aesdeclast	%xmm0,%xmm2
	aesdeclast	%xmm0,%xmm3
	aesdeclast	%xmm0,%xmm4
	aesdeclast	%xmm0,%xmm5
	ret
284
285
286
// _aesni_encrypt6: file-local helper that encrypts six blocks in parallel.
//   In:  %rcx -> round-key array, %eax = loop count,
//        %xmm2-%xmm7 = input blocks
//   Out: %xmm2-%xmm7 = results
// Clobbers %rax, %rcx, %xmm0, %xmm1 and flags.
// The first round on %xmm2-%xmm4 is issued while the remaining blocks are
// still being whitened, which is why the loop is entered at its midpoint.
.p2align	4
_aesni_encrypt6:

	movups	(%rcx),%xmm0		// round key 0
	shll	$4,%eax			// loop count -> byte offset
	movups	16(%rcx),%xmm1		// round key 1
	xorps	%xmm0,%xmm2		// whitening, interleaved with round 1
	pxor	%xmm0,%xmm3
	pxor	%xmm0,%xmm4
	aesenc	%xmm1,%xmm2
	leaq	32(%rcx,%rax),%rcx	// rcx -> past the keys walked by the loop
	negq	%rax
	aesenc	%xmm1,%xmm3
	pxor	%xmm0,%xmm5
	pxor	%xmm0,%xmm6
	aesenc	%xmm1,%xmm4
	pxor	%xmm0,%xmm7
	movups	(%rcx,%rax),%xmm0	// round key 2
	addq	$16,%rax
	jmp	L$enc_loop6_enter	// %xmm2-%xmm4 already had round 1

.p2align	4
L$enc_loop6:
	aesenc	%xmm1,%xmm2
	aesenc	%xmm1,%xmm3
	aesenc	%xmm1,%xmm4
L$enc_loop6_enter:
	aesenc	%xmm1,%xmm5
	aesenc	%xmm1,%xmm6
	aesenc	%xmm1,%xmm7
	movups	(%rcx,%rax),%xmm1
	addq	$32,%rax		// sets ZF for the jnz below
	aesenc	%xmm0,%xmm2
	aesenc	%xmm0,%xmm3
	aesenc	%xmm0,%xmm4
	aesenc	%xmm0,%xmm5
	aesenc	%xmm0,%xmm6
	aesenc	%xmm0,%xmm7
	movups	-16(%rcx,%rax),%xmm0
	jnz	L$enc_loop6

	aesenc	%xmm1,%xmm2
	aesenc	%xmm1,%xmm3
	aesenc	%xmm1,%xmm4
	aesenc	%xmm1,%xmm5
	aesenc	%xmm1,%xmm6
	aesenc	%xmm1,%xmm7
	aesenclast	%xmm0,%xmm2
	aesenclast	%xmm0,%xmm3
	aesenclast	%xmm0,%xmm4
	aesenclast	%xmm0,%xmm5
	aesenclast	%xmm0,%xmm6
	aesenclast	%xmm0,%xmm7
	ret
340
341
342
// _aesni_decrypt6: file-local helper that decrypts six blocks in parallel.
//   In:  %rcx -> round-key array, %eax = loop count,
//        %xmm2-%xmm7 = input blocks
//   Out: %xmm2-%xmm7 = results
// Clobbers %rax, %rcx, %xmm0, %xmm1 and flags.
// The first round on %xmm2-%xmm4 is issued while the remaining blocks are
// still being whitened, which is why the loop is entered at its midpoint.
.p2align	4
_aesni_decrypt6:

	movups	(%rcx),%xmm0		// round key 0
	shll	$4,%eax			// loop count -> byte offset
	movups	16(%rcx),%xmm1		// round key 1
	xorps	%xmm0,%xmm2		// whitening, interleaved with round 1
	pxor	%xmm0,%xmm3
	pxor	%xmm0,%xmm4
	aesdec	%xmm1,%xmm2
	leaq	32(%rcx,%rax),%rcx	// rcx -> past the keys walked by the loop
	negq	%rax
	aesdec	%xmm1,%xmm3
	pxor	%xmm0,%xmm5
	pxor	%xmm0,%xmm6
	aesdec	%xmm1,%xmm4
	pxor	%xmm0,%xmm7
	movups	(%rcx,%rax),%xmm0	// round key 2
	addq	$16,%rax
	jmp	L$dec_loop6_enter	// %xmm2-%xmm4 already had round 1

.p2align	4
L$dec_loop6:
	aesdec	%xmm1,%xmm2
	aesdec	%xmm1,%xmm3
	aesdec	%xmm1,%xmm4
L$dec_loop6_enter:
	aesdec	%xmm1,%xmm5
	aesdec	%xmm1,%xmm6
	aesdec	%xmm1,%xmm7
	movups	(%rcx,%rax),%xmm1
	addq	$32,%rax		// sets ZF for the jnz below
	aesdec	%xmm0,%xmm2
	aesdec	%xmm0,%xmm3
	aesdec	%xmm0,%xmm4
	aesdec	%xmm0,%xmm5
	aesdec	%xmm0,%xmm6
	aesdec	%xmm0,%xmm7
	movups	-16(%rcx,%rax),%xmm0
	jnz	L$dec_loop6

	aesdec	%xmm1,%xmm2
	aesdec	%xmm1,%xmm3
	aesdec	%xmm1,%xmm4
	aesdec	%xmm1,%xmm5
	aesdec	%xmm1,%xmm6
	aesdec	%xmm1,%xmm7
	aesdeclast	%xmm0,%xmm2
	aesdeclast	%xmm0,%xmm3
	aesdeclast	%xmm0,%xmm4
	aesdeclast	%xmm0,%xmm5
	aesdeclast	%xmm0,%xmm6
	aesdeclast	%xmm0,%xmm7
	ret
396
397
398
// _aesni_encrypt8: file-local helper that encrypts eight blocks in parallel.
//   In:  %rcx -> round-key array, %eax = loop count,
//        %xmm2-%xmm9 = input blocks
//   Out: %xmm2-%xmm9 = results
// Clobbers %rax, %rcx, %xmm0, %xmm1 and flags.
// L$enc_loop8_enter is also used as an entry point from outside this
// routine (the CTR tail code calls it directly), so it must keep its name
// and its register expectations: %xmm0 = next key for the second half of
// the iteration, %rcx/%rax = key cursor as set up below.
.p2align	4
_aesni_encrypt8:

	movups	(%rcx),%xmm0		// round key 0
	shll	$4,%eax			// loop count -> byte offset
	movups	16(%rcx),%xmm1		// round key 1
	xorps	%xmm0,%xmm2		// whitening, interleaved with round 1
	xorps	%xmm0,%xmm3
	pxor	%xmm0,%xmm4
	pxor	%xmm0,%xmm5
	pxor	%xmm0,%xmm6
	leaq	32(%rcx,%rax),%rcx	// rcx -> past the keys walked by the loop
	negq	%rax
	aesenc	%xmm1,%xmm2
	pxor	%xmm0,%xmm7
	pxor	%xmm0,%xmm8
	aesenc	%xmm1,%xmm3
	pxor	%xmm0,%xmm9
	movups	(%rcx,%rax),%xmm0	// round key 2
	addq	$16,%rax
	jmp	L$enc_loop8_inner	// %xmm2/%xmm3 already had round 1

.p2align	4
L$enc_loop8:
	aesenc	%xmm1,%xmm2
	aesenc	%xmm1,%xmm3
L$enc_loop8_inner:
	aesenc	%xmm1,%xmm4
	aesenc	%xmm1,%xmm5
	aesenc	%xmm1,%xmm6
	aesenc	%xmm1,%xmm7
	aesenc	%xmm1,%xmm8
	aesenc	%xmm1,%xmm9
L$enc_loop8_enter:
	movups	(%rcx,%rax),%xmm1
	addq	$32,%rax		// sets ZF for the jnz below
	aesenc	%xmm0,%xmm2
	aesenc	%xmm0,%xmm3
	aesenc	%xmm0,%xmm4
	aesenc	%xmm0,%xmm5
	aesenc	%xmm0,%xmm6
	aesenc	%xmm0,%xmm7
	aesenc	%xmm0,%xmm8
	aesenc	%xmm0,%xmm9
	movups	-16(%rcx,%rax),%xmm0
	jnz	L$enc_loop8

	aesenc	%xmm1,%xmm2
	aesenc	%xmm1,%xmm3
	aesenc	%xmm1,%xmm4
	aesenc	%xmm1,%xmm5
	aesenc	%xmm1,%xmm6
	aesenc	%xmm1,%xmm7
	aesenc	%xmm1,%xmm8
	aesenc	%xmm1,%xmm9
	aesenclast	%xmm0,%xmm2
	aesenclast	%xmm0,%xmm3
	aesenclast	%xmm0,%xmm4
	aesenclast	%xmm0,%xmm5
	aesenclast	%xmm0,%xmm6
	aesenclast	%xmm0,%xmm7
	aesenclast	%xmm0,%xmm8
	aesenclast	%xmm0,%xmm9
	ret
462
463
464
// _aesni_decrypt8: file-local helper that decrypts eight blocks in parallel.
//   In:  %rcx -> round-key array, %eax = loop count,
//        %xmm2-%xmm9 = input blocks
//   Out: %xmm2-%xmm9 = results
// Clobbers %rax, %rcx, %xmm0, %xmm1 and flags.
// The first round on %xmm2/%xmm3 is issued while the remaining blocks are
// still being whitened, which is why the loop is entered at L$dec_loop8_inner.
.p2align	4
_aesni_decrypt8:

	movups	(%rcx),%xmm0		// round key 0
	shll	$4,%eax			// loop count -> byte offset
	movups	16(%rcx),%xmm1		// round key 1
	xorps	%xmm0,%xmm2		// whitening, interleaved with round 1
	xorps	%xmm0,%xmm3
	pxor	%xmm0,%xmm4
	pxor	%xmm0,%xmm5
	pxor	%xmm0,%xmm6
	leaq	32(%rcx,%rax),%rcx	// rcx -> past the keys walked by the loop
	negq	%rax
	aesdec	%xmm1,%xmm2
	pxor	%xmm0,%xmm7
	pxor	%xmm0,%xmm8
	aesdec	%xmm1,%xmm3
	pxor	%xmm0,%xmm9
	movups	(%rcx,%rax),%xmm0	// round key 2
	addq	$16,%rax
	jmp	L$dec_loop8_inner	// %xmm2/%xmm3 already had round 1

.p2align	4
L$dec_loop8:
	aesdec	%xmm1,%xmm2
	aesdec	%xmm1,%xmm3
L$dec_loop8_inner:
	aesdec	%xmm1,%xmm4
	aesdec	%xmm1,%xmm5
	aesdec	%xmm1,%xmm6
	aesdec	%xmm1,%xmm7
	aesdec	%xmm1,%xmm8
	aesdec	%xmm1,%xmm9
L$dec_loop8_enter:
	movups	(%rcx,%rax),%xmm1
	addq	$32,%rax		// sets ZF for the jnz below
	aesdec	%xmm0,%xmm2
	aesdec	%xmm0,%xmm3
	aesdec	%xmm0,%xmm4
	aesdec	%xmm0,%xmm5
	aesdec	%xmm0,%xmm6
	aesdec	%xmm0,%xmm7
	aesdec	%xmm0,%xmm8
	aesdec	%xmm0,%xmm9
	movups	-16(%rcx,%rax),%xmm0
	jnz	L$dec_loop8

	aesdec	%xmm1,%xmm2
	aesdec	%xmm1,%xmm3
	aesdec	%xmm1,%xmm4
	aesdec	%xmm1,%xmm5
	aesdec	%xmm1,%xmm6
	aesdec	%xmm1,%xmm7
	aesdec	%xmm1,%xmm8
	aesdec	%xmm1,%xmm9
	aesdeclast	%xmm0,%xmm2
	aesdeclast	%xmm0,%xmm3
	aesdeclast	%xmm0,%xmm4
	aesdeclast	%xmm0,%xmm5
	aesdeclast	%xmm0,%xmm6
	aesdeclast	%xmm0,%xmm7
	aesdeclast	%xmm0,%xmm8
	aesdeclast	%xmm0,%xmm9
	ret
528
529
// _aes_hw_ecb_encrypt: ECB-mode bulk encrypt/decrypt.
//   %rdi -> input, %rsi -> output, %rdx = length in bytes,
//   %rcx -> round-key array (32-bit loop count at byte offset 240),
//   %r8d = direction selector: zero takes the decrypt path, non-zero encrypts.
// The length is rounded down to a multiple of 16; if nothing remains the
// routine returns without touching the output.
// Working registers: %r11 = saved key pointer, %r10d = saved loop count
// (the helpers consume %rcx/%eax, so both are restored before each reuse),
// %xmm2-%xmm9 = data blocks, %xmm0/%xmm1 = round keys (zeroed on return).
// The decrypt path additionally zeroes each data register after storing it.
530.globl _aes_hw_ecb_encrypt
531.private_extern _aes_hw_ecb_encrypt
532
533.p2align 4
534_aes_hw_ecb_encrypt:
535
536_CET_ENDBR
// Drop any trailing partial block; nothing to do if fewer than 16 bytes.
537 andq $-16,%rdx
538 jz L$ecb_ret
539
540 movl 240(%rcx),%eax
541 movups (%rcx),%xmm0
542 movq %rcx,%r11
543 movl %eax,%r10d
544 testl %r8d,%r8d
545 jz L$ecb_decrypt
546
// ---- Encrypt: fewer than 128 bytes goes straight to the tail dispatch.
547 cmpq $0x80,%rdx
548 jb L$ecb_enc_tail
549
// Preload the first eight blocks and enter the loop at the call.
550 movdqu (%rdi),%xmm2
551 movdqu 16(%rdi),%xmm3
552 movdqu 32(%rdi),%xmm4
553 movdqu 48(%rdi),%xmm5
554 movdqu 64(%rdi),%xmm6
555 movdqu 80(%rdi),%xmm7
556 movdqu 96(%rdi),%xmm8
557 movdqu 112(%rdi),%xmm9
558 leaq 128(%rdi),%rdi
559 subq $0x80,%rdx
560 jmp L$ecb_enc_loop8_enter
561.p2align 4
// Loop body: store the previous eight results while loading the next eight
// inputs, and restore %rcx/%eax for the next helper call.
562L$ecb_enc_loop8:
563 movups %xmm2,(%rsi)
564 movq %r11,%rcx
565 movdqu (%rdi),%xmm2
566 movl %r10d,%eax
567 movups %xmm3,16(%rsi)
568 movdqu 16(%rdi),%xmm3
569 movups %xmm4,32(%rsi)
570 movdqu 32(%rdi),%xmm4
571 movups %xmm5,48(%rsi)
572 movdqu 48(%rdi),%xmm5
573 movups %xmm6,64(%rsi)
574 movdqu 64(%rdi),%xmm6
575 movups %xmm7,80(%rsi)
576 movdqu 80(%rdi),%xmm7
577 movups %xmm8,96(%rsi)
578 movdqu 96(%rdi),%xmm8
579 movups %xmm9,112(%rsi)
580 leaq 128(%rsi),%rsi
581 movdqu 112(%rdi),%xmm9
582 leaq 128(%rdi),%rdi
583L$ecb_enc_loop8_enter:
584
585 call _aesni_encrypt8
586
// Continue while at least another 128 bytes remain (no borrow from the sub).
587 subq $0x80,%rdx
588 jnc L$ecb_enc_loop8
589
// Flush the last full group of eight, then undo the over-subtraction to get
// the number of leftover bytes (0..112).
590 movups %xmm2,(%rsi)
591 movq %r11,%rcx
592 movups %xmm3,16(%rsi)
593 movl %r10d,%eax
594 movups %xmm4,32(%rsi)
595 movups %xmm5,48(%rsi)
596 movups %xmm6,64(%rsi)
597 movups %xmm7,80(%rsi)
598 movups %xmm8,96(%rsi)
599 movups %xmm9,112(%rsi)
600 leaq 128(%rsi),%rsi
601 addq $0x80,%rdx
602 jz L$ecb_ret
603
// Tail dispatch for 1..7 remaining blocks. Each cmpq is followed by a load
// (which leaves flags alone) so a single compare serves both the jb and the
// je that follows it. Falling through every test means seven blocks.
604L$ecb_enc_tail:
605 movups (%rdi),%xmm2
606 cmpq $0x20,%rdx
607 jb L$ecb_enc_one
608 movups 16(%rdi),%xmm3
609 je L$ecb_enc_two
610 movups 32(%rdi),%xmm4
611 cmpq $0x40,%rdx
612 jb L$ecb_enc_three
613 movups 48(%rdi),%xmm5
614 je L$ecb_enc_four
615 movups 64(%rdi),%xmm6
616 cmpq $0x60,%rdx
617 jb L$ecb_enc_five
618 movups 80(%rdi),%xmm7
619 je L$ecb_enc_six
// Seven blocks: run the 8-wide helper with a zeroed eighth lane and store
// only the first seven results.
620 movdqu 96(%rdi),%xmm8
621 xorps %xmm9,%xmm9
622 call _aesni_encrypt8
623 movups %xmm2,(%rsi)
624 movups %xmm3,16(%rsi)
625 movups %xmm4,32(%rsi)
626 movups %xmm5,48(%rsi)
627 movups %xmm6,64(%rsi)
628 movups %xmm7,80(%rsi)
629 movups %xmm8,96(%rsi)
630 jmp L$ecb_ret
631.p2align 4
// One block: inline single-block loop. The .byte sequences are
// aesenc %xmm1,%xmm2 (66 0F 38 DC D1) and aesenclast %xmm1,%xmm2 (..DD D1).
632L$ecb_enc_one:
633 movups (%rcx),%xmm0
634 movups 16(%rcx),%xmm1
635 leaq 32(%rcx),%rcx
636 xorps %xmm0,%xmm2
637L$oop_enc1_3:
638.byte 102,15,56,220,209
639 decl %eax
640 movups (%rcx),%xmm1
641 leaq 16(%rcx),%rcx
642 jnz L$oop_enc1_3
643.byte 102,15,56,221,209
644 movups %xmm2,(%rsi)
645 jmp L$ecb_ret
646.p2align 4
647L$ecb_enc_two:
648 call _aesni_encrypt2
649 movups %xmm2,(%rsi)
650 movups %xmm3,16(%rsi)
651 jmp L$ecb_ret
652.p2align 4
653L$ecb_enc_three:
654 call _aesni_encrypt3
655 movups %xmm2,(%rsi)
656 movups %xmm3,16(%rsi)
657 movups %xmm4,32(%rsi)
658 jmp L$ecb_ret
659.p2align 4
660L$ecb_enc_four:
661 call _aesni_encrypt4
662 movups %xmm2,(%rsi)
663 movups %xmm3,16(%rsi)
664 movups %xmm4,32(%rsi)
665 movups %xmm5,48(%rsi)
666 jmp L$ecb_ret
667.p2align 4
// Five blocks: use the 6-wide helper with a zeroed sixth lane.
668L$ecb_enc_five:
669 xorps %xmm7,%xmm7
670 call _aesni_encrypt6
671 movups %xmm2,(%rsi)
672 movups %xmm3,16(%rsi)
673 movups %xmm4,32(%rsi)
674 movups %xmm5,48(%rsi)
675 movups %xmm6,64(%rsi)
676 jmp L$ecb_ret
677.p2align 4
678L$ecb_enc_six:
679 call _aesni_encrypt6
680 movups %xmm2,(%rsi)
681 movups %xmm3,16(%rsi)
682 movups %xmm4,32(%rsi)
683 movups %xmm5,48(%rsi)
684 movups %xmm6,64(%rsi)
685 movups %xmm7,80(%rsi)
686 jmp L$ecb_ret
687
688.p2align 4
// ---- Decrypt: same structure as the encrypt path, using the decrypt
// helpers and wiping each data register (pxor reg,reg) after it is stored.
689L$ecb_decrypt:
690 cmpq $0x80,%rdx
691 jb L$ecb_dec_tail
692
693 movdqu (%rdi),%xmm2
694 movdqu 16(%rdi),%xmm3
695 movdqu 32(%rdi),%xmm4
696 movdqu 48(%rdi),%xmm5
697 movdqu 64(%rdi),%xmm6
698 movdqu 80(%rdi),%xmm7
699 movdqu 96(%rdi),%xmm8
700 movdqu 112(%rdi),%xmm9
701 leaq 128(%rdi),%rdi
702 subq $0x80,%rdx
703 jmp L$ecb_dec_loop8_enter
704.p2align 4
705L$ecb_dec_loop8:
706 movups %xmm2,(%rsi)
707 movq %r11,%rcx
708 movdqu (%rdi),%xmm2
709 movl %r10d,%eax
710 movups %xmm3,16(%rsi)
711 movdqu 16(%rdi),%xmm3
712 movups %xmm4,32(%rsi)
713 movdqu 32(%rdi),%xmm4
714 movups %xmm5,48(%rsi)
715 movdqu 48(%rdi),%xmm5
716 movups %xmm6,64(%rsi)
717 movdqu 64(%rdi),%xmm6
718 movups %xmm7,80(%rsi)
719 movdqu 80(%rdi),%xmm7
720 movups %xmm8,96(%rsi)
721 movdqu 96(%rdi),%xmm8
722 movups %xmm9,112(%rsi)
723 leaq 128(%rsi),%rsi
724 movdqu 112(%rdi),%xmm9
725 leaq 128(%rdi),%rdi
726L$ecb_dec_loop8_enter:
727
728 call _aesni_decrypt8
729
// Reload round key 0 through the saved key pointer, then loop while another
// full 128 bytes remain.
730 movups (%r11),%xmm0
731 subq $0x80,%rdx
732 jnc L$ecb_dec_loop8
733
// Flush the last full group, wiping each register as it is written out.
734 movups %xmm2,(%rsi)
735 pxor %xmm2,%xmm2
736 movq %r11,%rcx
737 movups %xmm3,16(%rsi)
738 pxor %xmm3,%xmm3
739 movl %r10d,%eax
740 movups %xmm4,32(%rsi)
741 pxor %xmm4,%xmm4
742 movups %xmm5,48(%rsi)
743 pxor %xmm5,%xmm5
744 movups %xmm6,64(%rsi)
745 pxor %xmm6,%xmm6
746 movups %xmm7,80(%rsi)
747 pxor %xmm7,%xmm7
748 movups %xmm8,96(%rsi)
749 pxor %xmm8,%xmm8
750 movups %xmm9,112(%rsi)
751 pxor %xmm9,%xmm9
752 leaq 128(%rsi),%rsi
753 addq $0x80,%rdx
754 jz L$ecb_ret
755
// Tail dispatch for 1..7 remaining blocks (same flag-sharing pattern as the
// encrypt tail: loads between cmpq and je do not disturb flags).
756L$ecb_dec_tail:
757 movups (%rdi),%xmm2
758 cmpq $0x20,%rdx
759 jb L$ecb_dec_one
760 movups 16(%rdi),%xmm3
761 je L$ecb_dec_two
762 movups 32(%rdi),%xmm4
763 cmpq $0x40,%rdx
764 jb L$ecb_dec_three
765 movups 48(%rdi),%xmm5
766 je L$ecb_dec_four
767 movups 64(%rdi),%xmm6
768 cmpq $0x60,%rdx
769 jb L$ecb_dec_five
770 movups 80(%rdi),%xmm7
771 je L$ecb_dec_six
// Seven blocks: 8-wide helper with a zeroed eighth lane.
772 movups 96(%rdi),%xmm8
773 movups (%rcx),%xmm0
774 xorps %xmm9,%xmm9
775 call _aesni_decrypt8
776 movups %xmm2,(%rsi)
777 pxor %xmm2,%xmm2
778 movups %xmm3,16(%rsi)
779 pxor %xmm3,%xmm3
780 movups %xmm4,32(%rsi)
781 pxor %xmm4,%xmm4
782 movups %xmm5,48(%rsi)
783 pxor %xmm5,%xmm5
784 movups %xmm6,64(%rsi)
785 pxor %xmm6,%xmm6
786 movups %xmm7,80(%rsi)
787 pxor %xmm7,%xmm7
788 movups %xmm8,96(%rsi)
789 pxor %xmm8,%xmm8
790 pxor %xmm9,%xmm9
791 jmp L$ecb_ret
792.p2align 4
// One block: inline single-block loop. The .byte sequences are
// aesdec %xmm1,%xmm2 (66 0F 38 DE D1) and aesdeclast %xmm1,%xmm2 (..DF D1).
793L$ecb_dec_one:
794 movups (%rcx),%xmm0
795 movups 16(%rcx),%xmm1
796 leaq 32(%rcx),%rcx
797 xorps %xmm0,%xmm2
798L$oop_dec1_4:
799.byte 102,15,56,222,209
800 decl %eax
801 movups (%rcx),%xmm1
802 leaq 16(%rcx),%rcx
803 jnz L$oop_dec1_4
804.byte 102,15,56,223,209
805 movups %xmm2,(%rsi)
806 pxor %xmm2,%xmm2
807 jmp L$ecb_ret
808.p2align 4
809L$ecb_dec_two:
810 call _aesni_decrypt2
811 movups %xmm2,(%rsi)
812 pxor %xmm2,%xmm2
813 movups %xmm3,16(%rsi)
814 pxor %xmm3,%xmm3
815 jmp L$ecb_ret
816.p2align 4
817L$ecb_dec_three:
818 call _aesni_decrypt3
819 movups %xmm2,(%rsi)
820 pxor %xmm2,%xmm2
821 movups %xmm3,16(%rsi)
822 pxor %xmm3,%xmm3
823 movups %xmm4,32(%rsi)
824 pxor %xmm4,%xmm4
825 jmp L$ecb_ret
826.p2align 4
827L$ecb_dec_four:
828 call _aesni_decrypt4
829 movups %xmm2,(%rsi)
830 pxor %xmm2,%xmm2
831 movups %xmm3,16(%rsi)
832 pxor %xmm3,%xmm3
833 movups %xmm4,32(%rsi)
834 pxor %xmm4,%xmm4
835 movups %xmm5,48(%rsi)
836 pxor %xmm5,%xmm5
837 jmp L$ecb_ret
838.p2align 4
// Five blocks: 6-wide helper with a zeroed sixth lane.
839L$ecb_dec_five:
840 xorps %xmm7,%xmm7
841 call _aesni_decrypt6
842 movups %xmm2,(%rsi)
843 pxor %xmm2,%xmm2
844 movups %xmm3,16(%rsi)
845 pxor %xmm3,%xmm3
846 movups %xmm4,32(%rsi)
847 pxor %xmm4,%xmm4
848 movups %xmm5,48(%rsi)
849 pxor %xmm5,%xmm5
850 movups %xmm6,64(%rsi)
851 pxor %xmm6,%xmm6
852 pxor %xmm7,%xmm7
853 jmp L$ecb_ret
854.p2align 4
// Six blocks: falls through into the common exit below.
855L$ecb_dec_six:
856 call _aesni_decrypt6
857 movups %xmm2,(%rsi)
858 pxor %xmm2,%xmm2
859 movups %xmm3,16(%rsi)
860 pxor %xmm3,%xmm3
861 movups %xmm4,32(%rsi)
862 pxor %xmm4,%xmm4
863 movups %xmm5,48(%rsi)
864 pxor %xmm5,%xmm5
865 movups %xmm6,64(%rsi)
866 pxor %xmm6,%xmm6
867 movups %xmm7,80(%rsi)
868 pxor %xmm7,%xmm7
869
// Common exit: wipe the round-key registers.
870L$ecb_ret:
871 xorps %xmm0,%xmm0
872 pxor %xmm1,%xmm1
873 ret
874
875
// _aes_hw_ctr32_encrypt_blocks: CTR-mode keystream XOR over whole blocks.
//   %rdi -> input, %rsi -> output, %rdx = number of 16-byte blocks,
//   %rcx -> round-key array (32-bit loop count at byte offset 240),
//   %r8  -> 16-byte counter block.
// Only the last 32-bit word of the counter block is advanced: it is
// byte-swapped, incremented per block, and swapped back, so it behaves as a
// big-endian counter that wraps modulo 2^32. The counter block in memory is
// only read here, never written back.
// Raw opcodes used below:
//   .byte 102,[68|65|69,]15,56,220,modrm  = aesenc     (66 [REX] 0F 38 DC)
//   .byte 102,[68|65|69,]15,56,221,modrm  = aesenclast (66 [REX] 0F 38 DD)
//   .byte 102,15,58,34,modrm,3            = pinsrd $3  (66 0F 3A 22 .. 03)
//   .byte 0x66,0x90                       = 2-byte nop
// NOTE(review): a block count of 0 is not special-cased; it takes the bulk
// path, reaches L$ctr32_loop3 and still reads and writes one block.
// Callers presumably never pass 0 -- confirm.
876.globl _aes_hw_ctr32_encrypt_blocks
877.private_extern _aes_hw_ctr32_encrypt_blocks
878
879.p2align 4
880_aes_hw_ctr32_encrypt_blocks:
881
882_CET_ENDBR
883#ifdef BORINGSSL_DISPATCH_TEST
884 movb $1,_BORINGSSL_function_hit(%rip)
885#endif
886 cmpq $1,%rdx
887 jne L$ctr32_bulk
888
889
890
// ---- Exactly one block: encrypt the counter block in %xmm2 with the simple
// one-round-per-iteration loop, XOR with the input in %xmm3, store, and wipe
// the registers. No stack frame is needed on this path.
891 movups (%r8),%xmm2
892 movups (%rdi),%xmm3
893 movl 240(%rcx),%edx
894 movups (%rcx),%xmm0
895 movups 16(%rcx),%xmm1
896 leaq 32(%rcx),%rcx
897 xorps %xmm0,%xmm2
898L$oop_enc1_5:
899.byte 102,15,56,220,209
900 decl %edx
901 movups (%rcx),%xmm1
902 leaq 16(%rcx),%rcx
903 jnz L$oop_enc1_5
904.byte 102,15,56,221,209
905 pxor %xmm0,%xmm0
906 pxor %xmm1,%xmm1
907 xorps %xmm3,%xmm2
908 pxor %xmm3,%xmm3
909 movups %xmm2,(%rsi)
910 xorps %xmm2,%xmm2
911 jmp L$ctr32_epilogue
912
913.p2align 4
// ---- Bulk path. %r11 remembers the entry %rsp; %rbp is saved by the push
// (at -8(%r11)) and then reused as scratch. 128 bytes of 16-byte-aligned
// stack hold eight counter blocks at 0,16,...,112(%rsp).
914L$ctr32_bulk:
915 leaq (%rsp),%r11
916
917 pushq %rbp
918
919 subq $128,%rsp
920 andq $-16,%rsp
921
922
923
924
// Build the eight counter blocks already XORed with round key 0:
//   %xmm2  = counter block ^ key[0]      (template for all eight slots)
//   %r8d   = byte-swapped counter word   (host-order running counter)
//   %ebp   = last dword of key[0]        (so word = bswap(ctr+i) ^ %ebp)
// Blocks 0-3 are finished in %xmm2-%xmm5 via pinsrd into dword 3; blocks 4-7
// are finished in memory by overwriting the dword at slot+12.
// %rdx is borrowed as scratch here; the block count is parked in %r10 and
// restored before %r10 itself is reused.
925 movdqu (%r8),%xmm2
926 movdqu (%rcx),%xmm0
927 movl 12(%r8),%r8d
928 pxor %xmm0,%xmm2
929 movl 12(%rcx),%ebp
930 movdqa %xmm2,0(%rsp)
931 bswapl %r8d
932 movdqa %xmm2,%xmm3
933 movdqa %xmm2,%xmm4
934 movdqa %xmm2,%xmm5
935 movdqa %xmm2,64(%rsp)
936 movdqa %xmm2,80(%rsp)
937 movdqa %xmm2,96(%rsp)
938 movq %rdx,%r10
939 movdqa %xmm2,112(%rsp)
940
941 leaq 1(%r8),%rax
942 leaq 2(%r8),%rdx
943 bswapl %eax
944 bswapl %edx
945 xorl %ebp,%eax
946 xorl %ebp,%edx
// pinsrd $3,%eax,%xmm3
947.byte 102,15,58,34,216,3
948 leaq 3(%r8),%rax
949 movdqa %xmm3,16(%rsp)
// pinsrd $3,%edx,%xmm4
950.byte 102,15,58,34,226,3
951 bswapl %eax
952 movq %r10,%rdx
953 leaq 4(%r8),%r10
954 movdqa %xmm4,32(%rsp)
955 xorl %ebp,%eax
956 bswapl %r10d
// pinsrd $3,%eax,%xmm5
957.byte 102,15,58,34,232,3
958 xorl %ebp,%r10d
959 movdqa %xmm5,48(%rsp)
960 leaq 5(%r8),%r9
961 movl %r10d,64+12(%rsp)
962 bswapl %r9d
963 leaq 6(%r8),%r10
// %eax = loop count from the key structure (kept for the size checks below).
964 movl 240(%rcx),%eax
965 xorl %ebp,%r9d
966 bswapl %r10d
967 movl %r9d,80+12(%rsp)
968 xorl %ebp,%r10d
969 leaq 7(%r8),%r9
970 movl %r10d,96+12(%rsp)
971 bswapl %r9d
972 xorl %ebp,%r9d
973 movl %r9d,112+12(%rsp)
974
// %xmm1 = round key 1; %xmm6/%xmm7 = counter blocks 4 and 5.
975 movups 16(%rcx),%xmm1
976
977 movdqa 64(%rsp),%xmm6
978 movdqa 80(%rsp),%xmm7
979
980 cmpq $8,%rdx
981 jb L$ctr32_tail
982
// Bias %rcx by +128 so the round keys are addressed as N-128(%rcx).
983 leaq 128(%rcx),%rcx
984 subq $8,%rdx
985 jmp L$ctr32_loop8
986
987.p2align 5
// ---- Main loop: eight blocks per iteration in %xmm2-%xmm9.
// AES rounds alternate between %xmm1 and %xmm0 as the round key. Interleaved
// between the aesenc instructions, the scalar code computes the counter words
// for the NEXT eight blocks (bswap(ctr+i) ^ %ebp) and writes them into the
// stack slots at slot+12, so the slots are ready when this iteration ends.
988L$ctr32_loop8:
989 addl $8,%r8d
990 movdqa 96(%rsp),%xmm8
991.byte 102,15,56,220,209
992 movl %r8d,%r9d
993 movdqa 112(%rsp),%xmm9
994.byte 102,15,56,220,217
995 bswapl %r9d
996 movups 32-128(%rcx),%xmm0
997.byte 102,15,56,220,225
998 xorl %ebp,%r9d
999 nop
1000.byte 102,15,56,220,233
1001 movl %r9d,0+12(%rsp)
1002 leaq 1(%r8),%r9
1003.byte 102,15,56,220,241
1004.byte 102,15,56,220,249
1005.byte 102,68,15,56,220,193
1006.byte 102,68,15,56,220,201
1007 movups 48-128(%rcx),%xmm1
1008 bswapl %r9d
1009.byte 102,15,56,220,208
1010.byte 102,15,56,220,216
1011 xorl %ebp,%r9d
1012.byte 0x66,0x90
1013.byte 102,15,56,220,224
1014.byte 102,15,56,220,232
1015 movl %r9d,16+12(%rsp)
1016 leaq 2(%r8),%r9
1017.byte 102,15,56,220,240
1018.byte 102,15,56,220,248
1019.byte 102,68,15,56,220,192
1020.byte 102,68,15,56,220,200
1021 movups 64-128(%rcx),%xmm0
1022 bswapl %r9d
1023.byte 102,15,56,220,209
1024.byte 102,15,56,220,217
1025 xorl %ebp,%r9d
1026.byte 0x66,0x90
1027.byte 102,15,56,220,225
1028.byte 102,15,56,220,233
1029 movl %r9d,32+12(%rsp)
1030 leaq 3(%r8),%r9
1031.byte 102,15,56,220,241
1032.byte 102,15,56,220,249
1033.byte 102,68,15,56,220,193
1034.byte 102,68,15,56,220,201
1035 movups 80-128(%rcx),%xmm1
1036 bswapl %r9d
1037.byte 102,15,56,220,208
1038.byte 102,15,56,220,216
1039 xorl %ebp,%r9d
1040.byte 0x66,0x90
1041.byte 102,15,56,220,224
1042.byte 102,15,56,220,232
1043 movl %r9d,48+12(%rsp)
1044 leaq 4(%r8),%r9
1045.byte 102,15,56,220,240
1046.byte 102,15,56,220,248
1047.byte 102,68,15,56,220,192
1048.byte 102,68,15,56,220,200
1049 movups 96-128(%rcx),%xmm0
1050 bswapl %r9d
1051.byte 102,15,56,220,209
1052.byte 102,15,56,220,217
1053 xorl %ebp,%r9d
1054.byte 0x66,0x90
1055.byte 102,15,56,220,225
1056.byte 102,15,56,220,233
1057 movl %r9d,64+12(%rsp)
1058 leaq 5(%r8),%r9
1059.byte 102,15,56,220,241
1060.byte 102,15,56,220,249
1061.byte 102,68,15,56,220,193
1062.byte 102,68,15,56,220,201
1063 movups 112-128(%rcx),%xmm1
1064 bswapl %r9d
1065.byte 102,15,56,220,208
1066.byte 102,15,56,220,216
1067 xorl %ebp,%r9d
1068.byte 0x66,0x90
1069.byte 102,15,56,220,224
1070.byte 102,15,56,220,232
1071 movl %r9d,80+12(%rsp)
1072 leaq 6(%r8),%r9
1073.byte 102,15,56,220,240
1074.byte 102,15,56,220,248
1075.byte 102,68,15,56,220,192
1076.byte 102,68,15,56,220,200
1077 movups 128-128(%rcx),%xmm0
1078 bswapl %r9d
1079.byte 102,15,56,220,209
1080.byte 102,15,56,220,217
1081 xorl %ebp,%r9d
1082.byte 0x66,0x90
1083.byte 102,15,56,220,225
1084.byte 102,15,56,220,233
1085 movl %r9d,96+12(%rsp)
1086 leaq 7(%r8),%r9
1087.byte 102,15,56,220,241
1088.byte 102,15,56,220,249
1089.byte 102,68,15,56,220,193
1090.byte 102,68,15,56,220,201
1091 movups 144-128(%rcx),%xmm1
1092 bswapl %r9d
1093.byte 102,15,56,220,208
1094.byte 102,15,56,220,216
1095.byte 102,15,56,220,224
1096 xorl %ebp,%r9d
1097 movdqu 0(%rdi),%xmm10
1098.byte 102,15,56,220,232
1099 movl %r9d,112+12(%rsp)
// Key-size dispatch on the loop count. The aesenc and movups instructions
// between this cmpl and the jb/je below do not modify flags, so both
// branches test this single compare: below 11 -> finish now, equal to 11 ->
// two extra rounds, above 11 -> four extra rounds.
1100 cmpl $11,%eax
1101.byte 102,15,56,220,240
1102.byte 102,15,56,220,248
1103.byte 102,68,15,56,220,192
1104.byte 102,68,15,56,220,200
1105 movups 160-128(%rcx),%xmm0
1106
1107 jb L$ctr32_enc_done
1108
1109.byte 102,15,56,220,209
1110.byte 102,15,56,220,217
1111.byte 102,15,56,220,225
1112.byte 102,15,56,220,233
1113.byte 102,15,56,220,241
1114.byte 102,15,56,220,249
1115.byte 102,68,15,56,220,193
1116.byte 102,68,15,56,220,201
1117 movups 176-128(%rcx),%xmm1
1118
1119.byte 102,15,56,220,208
1120.byte 102,15,56,220,216
1121.byte 102,15,56,220,224
1122.byte 102,15,56,220,232
1123.byte 102,15,56,220,240
1124.byte 102,15,56,220,248
1125.byte 102,68,15,56,220,192
1126.byte 102,68,15,56,220,200
1127 movups 192-128(%rcx),%xmm0
1128 je L$ctr32_enc_done
1129
1130.byte 102,15,56,220,209
1131.byte 102,15,56,220,217
1132.byte 102,15,56,220,225
1133.byte 102,15,56,220,233
1134.byte 102,15,56,220,241
1135.byte 102,15,56,220,249
1136.byte 102,68,15,56,220,193
1137.byte 102,68,15,56,220,201
1138 movups 208-128(%rcx),%xmm1
1139
1140.byte 102,15,56,220,208
1141.byte 102,15,56,220,216
1142.byte 102,15,56,220,224
1143.byte 102,15,56,220,232
1144.byte 102,15,56,220,240
1145.byte 102,15,56,220,248
1146.byte 102,68,15,56,220,192
1147.byte 102,68,15,56,220,200
1148 movups 224-128(%rcx),%xmm0
1149 jmp L$ctr32_enc_done
1150
1151.p2align 4
// Here %xmm0 holds the final round key and %xmm1 the one before it.
// Each input block is XORed with the final round key first, and that value
// is then used as the aesenclast operand, so the last round and the
// keystream-XOR with the input happen in one instruction.
1152L$ctr32_enc_done:
1153 movdqu 16(%rdi),%xmm11
1154 pxor %xmm0,%xmm10
1155 movdqu 32(%rdi),%xmm12
1156 pxor %xmm0,%xmm11
1157 movdqu 48(%rdi),%xmm13
1158 pxor %xmm0,%xmm12
1159 movdqu 64(%rdi),%xmm14
1160 pxor %xmm0,%xmm13
1161 movdqu 80(%rdi),%xmm15
1162 pxor %xmm0,%xmm14
1163 prefetcht0 448(%rdi)
1164 prefetcht0 512(%rdi)
1165 pxor %xmm0,%xmm15
// Last regular round with %xmm1 on all eight lanes.
1166.byte 102,15,56,220,209
1167.byte 102,15,56,220,217
1168.byte 102,15,56,220,225
1169.byte 102,15,56,220,233
1170.byte 102,15,56,220,241
1171.byte 102,15,56,220,249
1172.byte 102,68,15,56,220,193
1173.byte 102,68,15,56,220,201
// %xmm1 is free now: reuse it for the 7th input block; %xmm10 is reloaded
// with the 8th once its first value has been consumed.
1174 movdqu 96(%rdi),%xmm1
1175 leaq 128(%rdi),%rdi
1176
// aesenclast lanes 0..7 with (input ^ last key) from %xmm10-%xmm15, %xmm1,
// %xmm10; in between, %xmm11-%xmm15 and %xmm0 are refilled with the next
// iteration's counter blocks 0..5 from the stack and %xmm1 with round key 1.
1177.byte 102,65,15,56,221,210
1178 pxor %xmm0,%xmm1
1179 movdqu 112-128(%rdi),%xmm10
1180.byte 102,65,15,56,221,219
1181 pxor %xmm0,%xmm10
1182 movdqa 0(%rsp),%xmm11
1183.byte 102,65,15,56,221,228
1184.byte 102,65,15,56,221,237
1185 movdqa 16(%rsp),%xmm12
1186 movdqa 32(%rsp),%xmm13
1187.byte 102,65,15,56,221,246
1188.byte 102,65,15,56,221,255
1189 movdqa 48(%rsp),%xmm14
1190 movdqa 64(%rsp),%xmm15
1191.byte 102,68,15,56,221,193
1192 movdqa 80(%rsp),%xmm0
1193 movups 16-128(%rcx),%xmm1
1194.byte 102,69,15,56,221,202
1195
// Store the eight outputs and move the next counter blocks into %xmm2-%xmm7
// (%xmm8/%xmm9 are reloaded from the stack at the top of the loop).
1196 movups %xmm2,(%rsi)
1197 movdqa %xmm11,%xmm2
1198 movups %xmm3,16(%rsi)
1199 movdqa %xmm12,%xmm3
1200 movups %xmm4,32(%rsi)
1201 movdqa %xmm13,%xmm4
1202 movups %xmm5,48(%rsi)
1203 movdqa %xmm14,%xmm5
1204 movups %xmm6,64(%rsi)
1205 movdqa %xmm15,%xmm6
1206 movups %xmm7,80(%rsi)
1207 movdqa %xmm0,%xmm7
1208 movups %xmm8,96(%rsi)
1209 movups %xmm9,112(%rsi)
1210 leaq 128(%rsi),%rsi
1211
1212 subq $8,%rdx
1213 jnc L$ctr32_loop8
1214
// Fewer than eight blocks remain: recover the real remainder and, if
// non-zero, undo the +128 bias on %rcx before handling the tail.
1215 addq $8,%rdx
1216 jz L$ctr32_done
1217 leaq -128(%rcx),%rcx
1218
// ---- Tail: 1..7 blocks. %rcx is advanced to round key 1 (already in
// %xmm1). Fewer than 4 -> L$ctr32_loop3, exactly 4 -> L$ctr32_loop4,
// otherwise (5..7) fall through and borrow the 8-wide helper.
1219L$ctr32_tail:
1220
1221
1222 leaq 16(%rcx),%rcx
1223 cmpq $4,%rdx
1224 jb L$ctr32_loop3
1225 je L$ctr32_loop4
1226
1227
// 5..7 blocks: set up %rax/%rcx the way _aesni_encrypt8 does, perform the
// first round with %xmm1 on %xmm2-%xmm8 here, then call into the middle of
// that helper at L$enc_loop8_enter. %xmm9 is a zeroed dummy lane whose
// result is never stored.
1228 shll $4,%eax
1229 movdqa 96(%rsp),%xmm8
1230 pxor %xmm9,%xmm9
1231
1232 movups 16(%rcx),%xmm0
1233.byte 102,15,56,220,209
1234.byte 102,15,56,220,217
1235 leaq 32-16(%rcx,%rax,1),%rcx
1236 negq %rax
1237.byte 102,15,56,220,225
1238 addq $16,%rax
1239 movups (%rdi),%xmm10
1240.byte 102,15,56,220,233
1241.byte 102,15,56,220,241
1242 movups 16(%rdi),%xmm11
1243 movups 32(%rdi),%xmm12
1244.byte 102,15,56,220,249
1245.byte 102,68,15,56,220,193
1246
1247 call L$enc_loop8_enter
1248
// XOR keystream with input. Five blocks are always written on this path;
// the 6th and 7th only if the count says so (the je reuses the flags of
// the cmpq $6 because xorps/movups do not modify them).
1249 movdqu 48(%rdi),%xmm13
1250 pxor %xmm10,%xmm2
1251 movdqu 64(%rdi),%xmm10
1252 pxor %xmm11,%xmm3
1253 movdqu %xmm2,(%rsi)
1254 pxor %xmm12,%xmm4
1255 movdqu %xmm3,16(%rsi)
1256 pxor %xmm13,%xmm5
1257 movdqu %xmm4,32(%rsi)
1258 pxor %xmm10,%xmm6
1259 movdqu %xmm5,48(%rsi)
1260 movdqu %xmm6,64(%rsi)
1261 cmpq $6,%rdx
1262 jb L$ctr32_done
1263
1264 movups 80(%rdi),%xmm11
1265 xorps %xmm11,%xmm7
1266 movups %xmm7,80(%rsi)
1267 je L$ctr32_done
1268
1269 movups 96(%rdi),%xmm12
1270 xorps %xmm12,%xmm8
1271 movups %xmm8,96(%rsi)
1272 jmp L$ctr32_done
1273
1274.p2align 5
// Exactly four blocks: one round per iteration on %xmm2-%xmm5. decl sets ZF
// and the following aesenc/movups leave flags alone, so jnz tests the decl.
1275L$ctr32_loop4:
1276.byte 102,15,56,220,209
1277 leaq 16(%rcx),%rcx
1278 decl %eax
1279.byte 102,15,56,220,217
1280.byte 102,15,56,220,225
1281.byte 102,15,56,220,233
1282 movups (%rcx),%xmm1
1283 jnz L$ctr32_loop4
1284.byte 102,15,56,221,209
1285.byte 102,15,56,221,217
1286 movups (%rdi),%xmm10
1287 movups 16(%rdi),%xmm11
1288.byte 102,15,56,221,225
1289.byte 102,15,56,221,233
1290 movups 32(%rdi),%xmm12
1291 movups 48(%rdi),%xmm13
1292
1293 xorps %xmm10,%xmm2
1294 movups %xmm2,(%rsi)
1295 xorps %xmm11,%xmm3
1296 movups %xmm3,16(%rsi)
1297 pxor %xmm12,%xmm4
1298 movdqu %xmm4,32(%rsi)
1299 pxor %xmm13,%xmm5
1300 movdqu %xmm5,48(%rsi)
1301 jmp L$ctr32_done
1302
1303.p2align 5
// Fewer than four blocks: always encrypts three lanes (%xmm2-%xmm4), then
// stores the first block unconditionally and the 2nd/3rd only if the count
// says so (the je reuses the flags of cmpq $2).
1304L$ctr32_loop3:
1305.byte 102,15,56,220,209
1306 leaq 16(%rcx),%rcx
1307 decl %eax
1308.byte 102,15,56,220,217
1309.byte 102,15,56,220,225
1310 movups (%rcx),%xmm1
1311 jnz L$ctr32_loop3
1312.byte 102,15,56,221,209
1313.byte 102,15,56,221,217
1314.byte 102,15,56,221,225
1315
1316 movups (%rdi),%xmm10
1317 xorps %xmm10,%xmm2
1318 movups %xmm2,(%rsi)
1319 cmpq $2,%rdx
1320 jb L$ctr32_done
1321
1322 movups 16(%rdi),%xmm11
1323 xorps %xmm11,%xmm3
1324 movups %xmm3,16(%rsi)
1325 je L$ctr32_done
1326
1327 movups 32(%rdi),%xmm12
1328 xorps %xmm12,%xmm4
1329 movups %xmm4,32(%rsi)
1330
// ---- Bulk-path exit: zero %ebp, every XMM register and the 128-byte stack
// scratch area, then restore %rbp from the slot the push wrote and reset
// %rsp to its value at entry.
1331L$ctr32_done:
1332 xorps %xmm0,%xmm0
1333 xorl %ebp,%ebp
1334 pxor %xmm1,%xmm1
1335 pxor %xmm2,%xmm2
1336 pxor %xmm3,%xmm3
1337 pxor %xmm4,%xmm4
1338 pxor %xmm5,%xmm5
1339 pxor %xmm6,%xmm6
1340 pxor %xmm7,%xmm7
1341 movaps %xmm0,0(%rsp)
1342 pxor %xmm8,%xmm8
1343 movaps %xmm0,16(%rsp)
1344 pxor %xmm9,%xmm9
1345 movaps %xmm0,32(%rsp)
1346 pxor %xmm10,%xmm10
1347 movaps %xmm0,48(%rsp)
1348 pxor %xmm11,%xmm11
1349 movaps %xmm0,64(%rsp)
1350 pxor %xmm12,%xmm12
1351 movaps %xmm0,80(%rsp)
1352 pxor %xmm13,%xmm13
1353 movaps %xmm0,96(%rsp)
1354 pxor %xmm14,%xmm14
1355 movaps %xmm0,112(%rsp)
1356 pxor %xmm15,%xmm15
1357 movq -8(%r11),%rbp
1358
1359 leaq (%r11),%rsp
1360
// Shared return; the single-block path jumps here directly.
1361L$ctr32_epilogue:
1362 ret
1363
1364
// _aes_hw_cbc_encrypt: AES in CBC mode using AES-NI; encrypts or decrypts
// depending on %r9d.
//
// Registers as read by the code below (System V AMD64 argument order):
//   %rdi  input pointer
//   %rsi  output pointer
//   %rdx  length in bytes (returns immediately when zero)
//   %rcx  key schedule; the 32-bit value at offset 240 is the round-loop count
//   %r8   16-byte IV, read on entry and overwritten with the next chaining value
//   %r9d  nonzero = encrypt, zero = decrypt
//
// AES instructions are emitted as raw bytes:
//   102,15,56,220 = aesenc   102,15,56,221 = aesenclast
//   102,15,56,222 = aesdec   102,15,56,223 = aesdeclast
// A 65/68/69 byte after 102 is a REX prefix selecting %xmm8-%xmm15.
1365.globl _aes_hw_cbc_encrypt
1366.private_extern _aes_hw_cbc_encrypt
1367
1368.p2align 4
1369_aes_hw_cbc_encrypt:
1370
1371_CET_ENDBR
// Nothing to do for a zero length.
1372 testq %rdx,%rdx
1373 jz L$cbc_ret
1374
// Keep the round count in %r10d and the schedule base in %r11; the encrypt
// loop reloads %eax/%rcx from them after each block.
1375 movl 240(%rcx),%r10d
1376 movq %rcx,%r11
1377 testl %r9d,%r9d
1378 jz L$cbc_decrypt
1379
// ---- Encryption ----
// %xmm2 carries the chaining value (IV first, then each ciphertext block).
1380 movups (%r8),%xmm2
1381 movl %r10d,%eax
1382 cmpq $16,%rdx
1383 jb L$cbc_enc_tail
// Bias %rdx by one block so that the borrow from "subq $16" at the bottom of
// the loop signals that no further full block remains.
1384 subq $16,%rdx
1385 jmp L$cbc_enc_loop
1386.p2align 4
1387L$cbc_enc_loop:
1388 movups (%rdi),%xmm3
1389 leaq 16(%rdi),%rdi
1390
// %xmm2 = chaining value ^ plaintext ^ round key 0.
1391 movups (%rcx),%xmm0
1392 movups 16(%rcx),%xmm1
1393 xorps %xmm0,%xmm3
1394 leaq 32(%rcx),%rcx
1395 xorps %xmm3,%xmm2
// aesenc %xmm1,%xmm2 for %eax rounds, then aesenclast with the final key.
1396L$oop_enc1_6:
1397.byte 102,15,56,220,209
1398 decl %eax
1399 movups (%rcx),%xmm1
1400 leaq 16(%rcx),%rcx
1401 jnz L$oop_enc1_6
1402.byte 102,15,56,221,209
// Reset round counter and key pointer, emit the ciphertext block.
1403 movl %r10d,%eax
1404 movq %r11,%rcx
1405 movups %xmm2,0(%rsi)
1406 leaq 16(%rsi),%rsi
1407 subq $16,%rdx
1408 jnc L$cbc_enc_loop
// Undo the bias: %rdx is now the number of leftover bytes (0..15).
1409 addq $16,%rdx
1410 jnz L$cbc_enc_tail
// Done: wipe round keys, write the last ciphertext block back as the IV,
// then wipe the data registers.
1411 pxor %xmm0,%xmm0
1412 pxor %xmm1,%xmm1
1413 movups %xmm2,(%r8)
1414 pxor %xmm2,%xmm2
1415 pxor %xmm3,%xmm3
1416 jmp L$cbc_ret
1417
// Partial final block: copy the %rdx leftover input bytes to the output
// buffer, zero-fill the rest of that 16-byte block there, then run the loop
// once more with input == output pointing at the padded block.
1418L$cbc_enc_tail:
1419 movq %rdx,%rcx
1420 xchgq %rdi,%rsi
// F3 A4 66 90 = rep movsb followed by a 2-byte nop.
1421.long 0x9066A4F3
1422 movl $16,%ecx
1423 subq %rdx,%rcx
1424 xorl %eax,%eax
// F3 AA 66 90 = rep stosb (stores %al = 0) followed by a 2-byte nop.
1425.long 0x9066AAF3
// %rdi has advanced 16 bytes in total; step back to the start of the block.
1426 leaq -16(%rdi),%rdi
1427 movl %r10d,%eax
1428 movq %rdi,%rsi
1429 movq %r11,%rcx
// With %rdx = 0 the loop's "subq $16" borrows, so exactly one block is done.
1430 xorq %rdx,%rdx
1431 jmp L$cbc_enc_loop
1432
// ---- Decryption ----
1433.p2align 4
1434L$cbc_decrypt:
1435 cmpq $16,%rdx
1436 jne L$cbc_decrypt_bulk
1437
1438
1439
// Exactly one block: no stack frame needed.
// %xmm2 = ciphertext, %xmm3 = IV, %xmm4 = ciphertext copy (becomes next IV).
1440 movdqu (%rdi),%xmm2
1441 movdqu (%r8),%xmm3
1442 movdqa %xmm2,%xmm4
1443 movups (%rcx),%xmm0
1444 movups 16(%rcx),%xmm1
1445 leaq 32(%rcx),%rcx
1446 xorps %xmm0,%xmm2
// aesdec %xmm1,%xmm2 for %r10d rounds, then aesdeclast.
1447L$oop_dec1_7:
1448.byte 102,15,56,222,209
1449 decl %r10d
1450 movups (%rcx),%xmm1
1451 leaq 16(%rcx),%rcx
1452 jnz L$oop_dec1_7
1453.byte 102,15,56,223,209
// Wipe round keys, update the IV, xor the old IV in to get the plaintext.
1454 pxor %xmm0,%xmm0
1455 pxor %xmm1,%xmm1
1456 movdqu %xmm4,(%r8)
1457 xorps %xmm3,%xmm2
1458 pxor %xmm3,%xmm3
1459 movups %xmm2,(%rsi)
1460 pxor %xmm2,%xmm2
1461 jmp L$cbc_ret
1462.p2align 4
1463L$cbc_decrypt_bulk:
// %r11 = caller's %rsp (restored at L$cbc_dec_ret). Save %rbp and carve out
// a 16-byte-aligned scratch slot at (%rsp) for the partial-tail copy.
1464 leaq (%rsp),%r11
1465
1466 pushq %rbp
1467
1468 subq $16,%rsp
1469 andq $-16,%rsp
1470 movq %rcx,%rbp
// %xmm10 = chaining value for the first block (the IV).
1471 movups (%r8),%xmm10
1472 movl %r10d,%eax
// Up to 0x50 bytes (five blocks) are handled by the tail code.
1473 cmpq $0x50,%rdx
1474 jbe L$cbc_dec_tail
1475
// Load six ciphertext blocks into %xmm2-%xmm7 and keep copies of the first
// five in %xmm11-%xmm15 to serve as chaining values. %xmm0 = round key 0.
1476 movups (%rcx),%xmm0
1477 movdqu 0(%rdi),%xmm2
1478 movdqu 16(%rdi),%xmm3
1479 movdqa %xmm2,%xmm11
1480 movdqu 32(%rdi),%xmm4
1481 movdqa %xmm3,%xmm12
1482 movdqu 48(%rdi),%xmm5
1483 movdqa %xmm4,%xmm13
1484 movdqu 64(%rdi),%xmm6
1485 movdqa %xmm5,%xmm14
1486 movdqu 80(%rdi),%xmm7
1487 movdqa %xmm6,%xmm15
1488 cmpq $0x70,%rdx
1489 jbe L$cbc_dec_six_or_seven
1490
// Eight-block loop. %rdx is biased by -0x70 and %rcx by +112, which is why
// round keys are addressed as N-112(%rcx) below.
1491 subq $0x70,%rdx
1492 leaq 112(%rcx),%rcx
1493 jmp L$cbc_dec_loop8_enter
1494.p2align 4
1495L$cbc_dec_loop8:
// Store the eighth plaintext block left pending by the previous iteration.
1496 movups %xmm9,(%rsi)
1497 leaq 16(%rsi),%rsi
1498L$cbc_dec_loop8_enter:
// Load blocks 6 and 7 and xor round key 0 into all eight blocks.
1499 movdqu 96(%rdi),%xmm8
1500 pxor %xmm0,%xmm2
1501 movdqu 112(%rdi),%xmm9
1502 pxor %xmm0,%xmm3
1503 movups 16-112(%rcx),%xmm1
1504 pxor %xmm0,%xmm4
// Branch-free choice of the address the next blocks are preloaded from:
// the cmpq sets CF when %rdx < 0x70; the adcq/andq/addq below (none of the
// intervening pxor/movups/aesdec touch flags) then yield
// %rbp = %rdi when CF was set, otherwise %rbp = %rdi + 128.
1505 movq $-1,%rbp
1506 cmpq $0x70,%rdx
1507 pxor %xmm0,%xmm5
1508 pxor %xmm0,%xmm6
1509 pxor %xmm0,%xmm7
1510 pxor %xmm0,%xmm8
1511
// Rounds alternate between %xmm1 and %xmm0 as the key register; each group
// of eight .byte lines is one aesdec round applied to %xmm2-%xmm9.
1512.byte 102,15,56,222,209
1513 pxor %xmm0,%xmm9
1514 movups 32-112(%rcx),%xmm0
1515.byte 102,15,56,222,217
1516.byte 102,15,56,222,225
1517.byte 102,15,56,222,233
1518.byte 102,15,56,222,241
1519.byte 102,15,56,222,249
1520.byte 102,68,15,56,222,193
1521 adcq $0,%rbp
1522 andq $128,%rbp
1523.byte 102,68,15,56,222,201
1524 addq %rdi,%rbp
1525 movups 48-112(%rcx),%xmm1
1526.byte 102,15,56,222,208
1527.byte 102,15,56,222,216
1528.byte 102,15,56,222,224
1529.byte 102,15,56,222,232
1530.byte 102,15,56,222,240
1531.byte 102,15,56,222,248
1532.byte 102,68,15,56,222,192
1533.byte 102,68,15,56,222,200
1534 movups 64-112(%rcx),%xmm0
1535 nop
1536.byte 102,15,56,222,209
1537.byte 102,15,56,222,217
1538.byte 102,15,56,222,225
1539.byte 102,15,56,222,233
1540.byte 102,15,56,222,241
1541.byte 102,15,56,222,249
1542.byte 102,68,15,56,222,193
1543.byte 102,68,15,56,222,201
1544 movups 80-112(%rcx),%xmm1
1545 nop
1546.byte 102,15,56,222,208
1547.byte 102,15,56,222,216
1548.byte 102,15,56,222,224
1549.byte 102,15,56,222,232
1550.byte 102,15,56,222,240
1551.byte 102,15,56,222,248
1552.byte 102,68,15,56,222,192
1553.byte 102,68,15,56,222,200
1554 movups 96-112(%rcx),%xmm0
1555 nop
1556.byte 102,15,56,222,209
1557.byte 102,15,56,222,217
1558.byte 102,15,56,222,225
1559.byte 102,15,56,222,233
1560.byte 102,15,56,222,241
1561.byte 102,15,56,222,249
1562.byte 102,68,15,56,222,193
1563.byte 102,68,15,56,222,201
1564 movups 112-112(%rcx),%xmm1
1565 nop
1566.byte 102,15,56,222,208
1567.byte 102,15,56,222,216
1568.byte 102,15,56,222,224
1569.byte 102,15,56,222,232
1570.byte 102,15,56,222,240
1571.byte 102,15,56,222,248
1572.byte 102,68,15,56,222,192
1573.byte 102,68,15,56,222,200
1574 movups 128-112(%rcx),%xmm0
1575 nop
1576.byte 102,15,56,222,209
1577.byte 102,15,56,222,217
1578.byte 102,15,56,222,225
1579.byte 102,15,56,222,233
1580.byte 102,15,56,222,241
1581.byte 102,15,56,222,249
1582.byte 102,68,15,56,222,193
1583.byte 102,68,15,56,222,201
1584 movups 144-112(%rcx),%xmm1
// Compare the round count once; the flags stay live across the following
// aesdec/movups and are consumed by the jb and je below. The key-setup code
// in this file stores 9, 11 or 13 as the count.
1585 cmpl $11,%eax
1586.byte 102,15,56,222,208
1587.byte 102,15,56,222,216
1588.byte 102,15,56,222,224
1589.byte 102,15,56,222,232
1590.byte 102,15,56,222,240
1591.byte 102,15,56,222,248
1592.byte 102,68,15,56,222,192
1593.byte 102,68,15,56,222,200
1594 movups 160-112(%rcx),%xmm0
// Count below 11: %xmm1/%xmm0 already hold the last two round keys.
1595 jb L$cbc_dec_done
1596.byte 102,15,56,222,209
1597.byte 102,15,56,222,217
1598.byte 102,15,56,222,225
1599.byte 102,15,56,222,233
1600.byte 102,15,56,222,241
1601.byte 102,15,56,222,249
1602.byte 102,68,15,56,222,193
1603.byte 102,68,15,56,222,201
1604 movups 176-112(%rcx),%xmm1
1605 nop
1606.byte 102,15,56,222,208
1607.byte 102,15,56,222,216
1608.byte 102,15,56,222,224
1609.byte 102,15,56,222,232
1610.byte 102,15,56,222,240
1611.byte 102,15,56,222,248
1612.byte 102,68,15,56,222,192
1613.byte 102,68,15,56,222,200
1614 movups 192-112(%rcx),%xmm0
// Count equal to 11: two extra rounds were enough.
1615 je L$cbc_dec_done
1616.byte 102,15,56,222,209
1617.byte 102,15,56,222,217
1618.byte 102,15,56,222,225
1619.byte 102,15,56,222,233
1620.byte 102,15,56,222,241
1621.byte 102,15,56,222,249
1622.byte 102,68,15,56,222,193
1623.byte 102,68,15,56,222,201
1624 movups 208-112(%rcx),%xmm1
1625 nop
1626.byte 102,15,56,222,208
1627.byte 102,15,56,222,216
1628.byte 102,15,56,222,224
1629.byte 102,15,56,222,232
1630.byte 102,15,56,222,240
1631.byte 102,15,56,222,248
1632.byte 102,68,15,56,222,192
1633.byte 102,68,15,56,222,200
1634 movups 224-112(%rcx),%xmm0
1635 jmp L$cbc_dec_done
1636.p2align 4
1637L$cbc_dec_done:
// Here %xmm1 = second-to-last round key and %xmm0 = last round key.
// The last round key is xored into each chaining value so that a single
// aesdeclast (which ends by xoring its key operand into the state) applies
// both the final round key and the CBC chaining xor.
1638.byte 102,15,56,222,209
1639.byte 102,15,56,222,217
1640 pxor %xmm0,%xmm10
1641 pxor %xmm0,%xmm11
1642.byte 102,15,56,222,225
1643.byte 102,15,56,222,233
1644 pxor %xmm0,%xmm12
1645 pxor %xmm0,%xmm13
1646.byte 102,15,56,222,241
1647.byte 102,15,56,222,249
1648 pxor %xmm0,%xmm14
1649 pxor %xmm0,%xmm15
1650.byte 102,68,15,56,222,193
1651.byte 102,68,15,56,222,201
// %xmm1 is free now: reuse it for ciphertext block 5 (chaining value of
// block 6). %xmm10 is reloaded with ciphertext block 6 once block 0 has
// consumed the old value.
1652 movdqu 80(%rdi),%xmm1
1653
1654.byte 102,65,15,56,223,210
1655 movdqu 96(%rdi),%xmm10
1656 pxor %xmm0,%xmm1
1657.byte 102,65,15,56,223,219
1658 pxor %xmm0,%xmm10
// %xmm0 = ciphertext block 7, which becomes the next chaining value.
1659 movdqu 112(%rdi),%xmm0
1660.byte 102,65,15,56,223,228
1661 leaq 128(%rdi),%rdi
// Preload the next six ciphertext blocks from %rbp (see the selection above)
// while the remaining aesdeclast instructions complete.
1662 movdqu 0(%rbp),%xmm11
1663.byte 102,65,15,56,223,237
1664.byte 102,65,15,56,223,246
1665 movdqu 16(%rbp),%xmm12
1666 movdqu 32(%rbp),%xmm13
1667.byte 102,65,15,56,223,255
1668.byte 102,68,15,56,223,193
1669 movdqu 48(%rbp),%xmm14
1670 movdqu 64(%rbp),%xmm15
1671.byte 102,69,15,56,223,202
1672 movdqa %xmm0,%xmm10
1673 movdqu 80(%rbp),%xmm1
// Reload round key 0 for the next iteration.
1674 movups -112(%rcx),%xmm0
1675
// Store seven plaintext blocks and move the preloaded ciphertext into the
// working registers; %xmm11-%xmm15 keep their copies as chaining values.
1676 movups %xmm2,(%rsi)
1677 movdqa %xmm11,%xmm2
1678 movups %xmm3,16(%rsi)
1679 movdqa %xmm12,%xmm3
1680 movups %xmm4,32(%rsi)
1681 movdqa %xmm13,%xmm4
1682 movups %xmm5,48(%rsi)
1683 movdqa %xmm14,%xmm5
1684 movups %xmm6,64(%rsi)
1685 movdqa %xmm15,%xmm6
1686 movups %xmm7,80(%rsi)
1687 movdqa %xmm1,%xmm7
1688 movups %xmm8,96(%rsi)
1689 leaq 112(%rsi),%rsi
1690
1691 subq $0x80,%rdx
1692 ja L$cbc_dec_loop8
1693
// Loop exit: the eighth plaintext block is still in %xmm9. Undo the %rcx and
// %rdx biases; if nothing remains, %xmm2 carries that block to the common
// tail, otherwise store it and continue with the leftover blocks.
1694 movaps %xmm9,%xmm2
1695 leaq -112(%rcx),%rcx
1696 addq $0x70,%rdx
1697 jle L$cbc_dec_clear_tail_collected
1698 movups %xmm9,(%rsi)
1699 leaq 16(%rsi),%rsi
1700 cmpq $0x50,%rdx
1701 jbe L$cbc_dec_tail
1702
// %xmm2 was overwritten above; restore ciphertext block 0 from its copy.
1703 movaps %xmm11,%xmm2
1704L$cbc_dec_six_or_seven:
1705 cmpq $0x60,%rdx
1706 ja L$cbc_dec_seven
1707
// Six blocks: save ciphertext block 5 (next chaining value) in %xmm8.
// _aesni_decrypt6 is defined elsewhere in this file; presumably it decrypts
// %xmm2-%xmm7 with the schedule at %rcx and count %eax - confirm there.
1708 movaps %xmm7,%xmm8
1709 call _aesni_decrypt6
// Apply CBC chaining, store five blocks, leave the sixth in %xmm2 for the
// common tail, wiping plaintext registers as they are stored.
1710 pxor %xmm10,%xmm2
1711 movaps %xmm8,%xmm10
1712 pxor %xmm11,%xmm3
1713 movdqu %xmm2,(%rsi)
1714 pxor %xmm12,%xmm4
1715 movdqu %xmm3,16(%rsi)
1716 pxor %xmm3,%xmm3
1717 pxor %xmm13,%xmm5
1718 movdqu %xmm4,32(%rsi)
1719 pxor %xmm4,%xmm4
1720 pxor %xmm14,%xmm6
1721 movdqu %xmm5,48(%rsi)
1722 pxor %xmm5,%xmm5
1723 pxor %xmm15,%xmm7
1724 movdqu %xmm6,64(%rsi)
1725 pxor %xmm6,%xmm6
1726 leaq 80(%rsi),%rsi
1727 movdqa %xmm7,%xmm2
1728 pxor %xmm7,%xmm7
1729 jmp L$cbc_dec_tail_collected
1730
1731.p2align 4
1732L$cbc_dec_seven:
// Seven blocks: load block 6 into %xmm8, zero the unused eighth lane.
// _aesni_decrypt8 is defined elsewhere in this file.
1733 movups 96(%rdi),%xmm8
1734 xorps %xmm9,%xmm9
1735 call _aesni_decrypt8
// Reload ciphertext block 5 (chaining value for block 6) and block 6
// (next chaining value) from the input, then chain and store.
1736 movups 80(%rdi),%xmm9
1737 pxor %xmm10,%xmm2
1738 movups 96(%rdi),%xmm10
1739 pxor %xmm11,%xmm3
1740 movdqu %xmm2,(%rsi)
1741 pxor %xmm12,%xmm4
1742 movdqu %xmm3,16(%rsi)
1743 pxor %xmm3,%xmm3
1744 pxor %xmm13,%xmm5
1745 movdqu %xmm4,32(%rsi)
1746 pxor %xmm4,%xmm4
1747 pxor %xmm14,%xmm6
1748 movdqu %xmm5,48(%rsi)
1749 pxor %xmm5,%xmm5
1750 pxor %xmm15,%xmm7
1751 movdqu %xmm6,64(%rsi)
1752 pxor %xmm6,%xmm6
1753 pxor %xmm9,%xmm8
1754 movdqu %xmm7,80(%rsi)
1755 pxor %xmm7,%xmm7
1756 leaq 96(%rsi),%rsi
1757 movdqa %xmm8,%xmm2
1758 pxor %xmm8,%xmm8
1759 pxor %xmm9,%xmm9
1760 jmp L$cbc_dec_tail_collected
1761
// One to five blocks: load a block, subtract 16 from %rdx and dispatch as
// soon as the count reaches zero or borrows. Each case keeps a copy of the
// last ciphertext block it loaded as the next chaining value.
1762L$cbc_dec_tail:
1763 movups (%rdi),%xmm2
1764 subq $0x10,%rdx
1765 jbe L$cbc_dec_one
1766
1767 movups 16(%rdi),%xmm3
1768 movaps %xmm2,%xmm11
1769 subq $0x10,%rdx
1770 jbe L$cbc_dec_two
1771
1772 movups 32(%rdi),%xmm4
1773 movaps %xmm3,%xmm12
1774 subq $0x10,%rdx
1775 jbe L$cbc_dec_three
1776
1777 movups 48(%rdi),%xmm5
1778 movaps %xmm4,%xmm13
1779 subq $0x10,%rdx
1780 jbe L$cbc_dec_four
1781
// Five blocks: run the six-block helper with a zeroed sixth lane.
1782 movups 64(%rdi),%xmm6
1783 movaps %xmm5,%xmm14
1784 movaps %xmm6,%xmm15
1785 xorps %xmm7,%xmm7
1786 call _aesni_decrypt6
1787 pxor %xmm10,%xmm2
1788 movaps %xmm15,%xmm10
1789 pxor %xmm11,%xmm3
1790 movdqu %xmm2,(%rsi)
1791 pxor %xmm12,%xmm4
1792 movdqu %xmm3,16(%rsi)
1793 pxor %xmm3,%xmm3
1794 pxor %xmm13,%xmm5
1795 movdqu %xmm4,32(%rsi)
1796 pxor %xmm4,%xmm4
1797 pxor %xmm14,%xmm6
1798 movdqu %xmm5,48(%rsi)
1799 pxor %xmm5,%xmm5
1800 leaq 64(%rsi),%rsi
1801 movdqa %xmm6,%xmm2
1802 pxor %xmm6,%xmm6
1803 pxor %xmm7,%xmm7
// Account for the fifth block in the length counter.
1804 subq $0x10,%rdx
1805 jmp L$cbc_dec_tail_collected
1806
1807.p2align 4
1808L$cbc_dec_one:
// Single block, decrypted inline (same round loop as L$oop_dec1_7 but
// counting down %eax).
1809 movaps %xmm2,%xmm11
1810 movups (%rcx),%xmm0
1811 movups 16(%rcx),%xmm1
1812 leaq 32(%rcx),%rcx
1813 xorps %xmm0,%xmm2
1814L$oop_dec1_8:
1815.byte 102,15,56,222,209
1816 decl %eax
1817 movups (%rcx),%xmm1
1818 leaq 16(%rcx),%rcx
1819 jnz L$oop_dec1_8
1820.byte 102,15,56,223,209
1821 xorps %xmm10,%xmm2
1822 movaps %xmm11,%xmm10
1823 jmp L$cbc_dec_tail_collected
1824.p2align 4
1825L$cbc_dec_two:
// _aesni_decrypt2/3/4 are defined elsewhere in this file.
1826 movaps %xmm3,%xmm12
1827 call _aesni_decrypt2
1828 pxor %xmm10,%xmm2
1829 movaps %xmm12,%xmm10
1830 pxor %xmm11,%xmm3
1831 movdqu %xmm2,(%rsi)
1832 movdqa %xmm3,%xmm2
1833 pxor %xmm3,%xmm3
1834 leaq 16(%rsi),%rsi
1835 jmp L$cbc_dec_tail_collected
1836.p2align 4
1837L$cbc_dec_three:
1838 movaps %xmm4,%xmm13
1839 call _aesni_decrypt3
1840 pxor %xmm10,%xmm2
1841 movaps %xmm13,%xmm10
1842 pxor %xmm11,%xmm3
1843 movdqu %xmm2,(%rsi)
1844 pxor %xmm12,%xmm4
1845 movdqu %xmm3,16(%rsi)
1846 pxor %xmm3,%xmm3
1847 movdqa %xmm4,%xmm2
1848 pxor %xmm4,%xmm4
1849 leaq 32(%rsi),%rsi
1850 jmp L$cbc_dec_tail_collected
1851.p2align 4
1852L$cbc_dec_four:
1853 movaps %xmm5,%xmm14
1854 call _aesni_decrypt4
1855 pxor %xmm10,%xmm2
1856 movaps %xmm14,%xmm10
1857 pxor %xmm11,%xmm3
1858 movdqu %xmm2,(%rsi)
1859 pxor %xmm12,%xmm4
1860 movdqu %xmm3,16(%rsi)
1861 pxor %xmm3,%xmm3
1862 pxor %xmm13,%xmm5
1863 movdqu %xmm4,32(%rsi)
1864 pxor %xmm4,%xmm4
1865 movdqa %xmm5,%xmm2
1866 pxor %xmm5,%xmm5
1867 leaq 48(%rsi),%rsi
1868 jmp L$cbc_dec_tail_collected
1869
1870.p2align 4
1871L$cbc_dec_clear_tail_collected:
// Reached from the eight-block loop: wipe the plaintext left in %xmm3-%xmm9.
1872 pxor %xmm3,%xmm3
1873 pxor %xmm4,%xmm4
1874 pxor %xmm5,%xmm5
1875 pxor %xmm6,%xmm6
1876 pxor %xmm7,%xmm7
1877 pxor %xmm8,%xmm8
1878 pxor %xmm9,%xmm9
// Common tail: %xmm10 = next chaining value, %xmm2 = last plaintext block
// (not yet stored), %rsi = where it goes.
1879L$cbc_dec_tail_collected:
1880 movups %xmm10,(%r8)
// Low four bits of the length counter: zero means the last block is whole.
1881 andq $15,%rdx
1882 jnz L$cbc_dec_tail_partial
1883 movups %xmm2,(%rsi)
1884 pxor %xmm2,%xmm2
1885 jmp L$cbc_dec_ret
1886.p2align 4
1887L$cbc_dec_tail_partial:
// Spill the last plaintext block to the aligned stack slot and copy only
// part of it to the output with rep movsb, then overwrite the slot with the
// zeroed %xmm2.
// NOTE(review): the copy count is 16 - (%rdx & 15), i.e. 16 minus the
// residue rather than the residue itself - confirm against the caller
// contract for lengths that are not a multiple of 16.
1888 movaps %xmm2,(%rsp)
1889 pxor %xmm2,%xmm2
1890 movq $16,%rcx
1891 movq %rsi,%rdi
1892 subq %rdx,%rcx
1893 leaq (%rsp),%rsi
// F3 A4 66 90 = rep movsb followed by a 2-byte nop.
1894.long 0x9066A4F3
1895 movdqa %xmm2,(%rsp)
1896
1897L$cbc_dec_ret:
// Wipe round keys, then restore %rbp (pushed just below the saved %rsp) and
// the caller's stack pointer.
1898 xorps %xmm0,%xmm0
1899 pxor %xmm1,%xmm1
1900 movq -8(%r11),%rbp
1901
1902 leaq (%r11),%rsp
1903
1904L$cbc_ret:
1905 ret
1906
1907
// _aes_hw_set_decrypt_key: build an AES decryption key schedule.
//
// Arguments are passed straight through to __aesni_set_encrypt_key
// (%rdi = user key, %esi = key size in bits, %rdx = schedule to fill).
// The encryption schedule is then converted in place: the order of the round
// keys is reversed and aesimc (.byte 102,15,56,219,...) is applied to every
// round key except the first and the last.
// Returns %eax = 0 on success, or the nonzero code from
// __aesni_set_encrypt_key unchanged.
1908.globl _aes_hw_set_decrypt_key
1909.private_extern _aes_hw_set_decrypt_key
1910
1911.p2align 4
1912_aes_hw_set_decrypt_key:
1913
David Benjaminbfcab2a2024-05-12 10:26:48 -04001914
David Benjaminfe0c91e2024-03-18 15:37:24 +10001915_CET_ENDBR
// Keep the stack 16-byte aligned across the call below.
David Benjaminbfcab2a2024-05-12 10:26:48 -04001916 subq $8,%rsp
1917
1918
David Benjaminfe0c91e2024-03-18 15:37:24 +10001919
1920 call __aesni_set_encrypt_key
// On success the callee leaves its stored round count (9, 11 or 13) in %esi
// and does not modify %rdx; %esi * 16 is the byte offset used below.
1921 shll $4,%esi
1922 testl %eax,%eax
1923 jnz L$dec_key_ret
// %rdx = first round key, %rdi = last round key.
1924 leaq 16(%rdx,%rsi,1),%rdi
1925
// Swap the first and last round keys (no aesimc on these two).
1926 movups (%rdx),%xmm0
1927 movups (%rdi),%xmm1
1928 movups %xmm0,(%rdi)
1929 movups %xmm1,(%rdx)
1930 leaq 16(%rdx),%rdx
1931 leaq -16(%rdi),%rdi
1932
// Walk inward from both ends: apply aesimc to each pair and store swapped.
1933L$dec_key_inverse:
1934 movups (%rdx),%xmm0
1935 movups (%rdi),%xmm1
1936.byte 102,15,56,219,192
1937.byte 102,15,56,219,201
1938 leaq 16(%rdx),%rdx
1939 leaq -16(%rdi),%rdi
1940 movups %xmm0,16(%rdi)
1941 movups %xmm1,-16(%rdx)
1942 cmpq %rdx,%rdi
1943 ja L$dec_key_inverse
1944
// Middle round key: aesimc in place, then wipe the temporaries.
1945 movups (%rdx),%xmm0
1946.byte 102,15,56,219,192
1947 pxor %xmm1,%xmm1
1948 movups %xmm0,(%rdi)
1949 pxor %xmm0,%xmm0
1950L$dec_key_ret:
1951 addq $8,%rsp
1952
1953 ret
1954
David Benjaminbfcab2a2024-05-12 10:26:48 -04001955
David Benjaminfe0c91e2024-03-18 15:37:24 +10001956
// _aes_hw_set_encrypt_key / __aesni_set_encrypt_key: expand a user key into
// an AES encryption key schedule.
//
// Registers as read by the code below:
//   %rdi  user key bytes (16, 24 or 32 bytes are read depending on %esi)
//   %esi  key size in bits: 128, 192 or 256
//   %rdx  output key schedule
// Returns %eax = 0 on success, -2 for any other key size.
// Every success path stores the round-loop count (9, 11 or 13) as a 32-bit
// value at byte offset 240 of the schedule and leaves that count in %esi.
// %rdx is never modified; %rax is the running store pointer.
//
// Two expansion strategies exist per key size. The default uses
// aeskeygenassist (.byte 102,15,58,223,...) plus the L$key_expansion_*
// helpers; the "_alt" variant uses pshufb (.byte 102,15,56,0,...) and
// aesenclast (.byte 102,15,56,221,...) with the L$key_rotate*/L$key_rcon*
// constants. The variant is chosen from the second word of
// _OPENSSL_ia32cap_P.
1957.globl _aes_hw_set_encrypt_key
1958.private_extern _aes_hw_set_encrypt_key
1959
1960.p2align 4
1961_aes_hw_set_encrypt_key:
1962__aesni_set_encrypt_key:
1963
David Benjaminbfcab2a2024-05-12 10:26:48 -04001964
David Benjaminfe0c91e2024-03-18 15:37:24 +10001965_CET_ENDBR
1966#ifdef BORINGSSL_DISPATCH_TEST
// Dispatch-test builds record that this implementation was reached.
1967 movb $1,_BORINGSSL_function_hit+3(%rip)
1968#endif
// Keep the stack 16-byte aligned across the internal helper calls.
David Benjaminbfcab2a2024-05-12 10:26:48 -04001969 subq $8,%rsp
1970
1971
David Benjaminfe0c91e2024-03-18 15:37:24 +10001972
// %xmm0 = first 16 key bytes; %xmm4 = 0 (scratch for the shufps helpers).
David Benjaminfe0c91e2024-03-18 15:37:24 +10001973 movups (%rdi),%xmm0
1974 xorps %xmm4,%xmm4
// %r10d = capability word 1 masked with 0x10000800 (bits 28 and 11).
// The "_alt" paths are taken when the result is exactly 0x10000000, i.e.
// bit 28 set and bit 11 clear. NOTE(review): presumably the AVX and XOP
// bits of OpenSSL's capability vector - confirm.
1975 leaq _OPENSSL_ia32cap_P(%rip),%r10
1976 movl 4(%r10),%r10d
1977 andl $268437504,%r10d
// %rax = address of the second round-key slot.
1978 leaq 16(%rdx),%rax
1979 cmpl $256,%esi
1980 je L$14rounds
1981 cmpl $192,%esi
1982 je L$12rounds
1983 cmpl $128,%esi
1984 jne L$bad_keybits
1985
// ---- 128-bit key ----
1986L$10rounds:
1987 movl $9,%esi
1988 cmpl $268435456,%r10d
1989 je L$10rounds_alt
1990
// Round key 0 is the user key. Each step below is
// aeskeygenassist $rcon,%xmm0,%xmm1 followed by a helper call that stores
// the previous round key (except the first, "_cold", entry) and derives the
// next one in %xmm0.
1991 movups %xmm0,(%rdx)
1992.byte 102,15,58,223,200,1
1993 call L$key_expansion_128_cold
1994.byte 102,15,58,223,200,2
1995 call L$key_expansion_128
1996.byte 102,15,58,223,200,4
1997 call L$key_expansion_128
1998.byte 102,15,58,223,200,8
1999 call L$key_expansion_128
2000.byte 102,15,58,223,200,16
2001 call L$key_expansion_128
2002.byte 102,15,58,223,200,32
2003 call L$key_expansion_128
2004.byte 102,15,58,223,200,64
2005 call L$key_expansion_128
2006.byte 102,15,58,223,200,128
2007 call L$key_expansion_128
2008.byte 102,15,58,223,200,27
2009 call L$key_expansion_128
2010.byte 102,15,58,223,200,54
2011 call L$key_expansion_128
// Store the last round key; %rax is at schedule+160, so 80(%rax) is offset 240.
2012 movups %xmm0,(%rax)
2013 movl %esi,80(%rax)
2014 xorl %eax,%eax
2015 jmp L$enc_key_ret
2016
2017.p2align 4
2018L$10rounds_alt:
// %xmm5 = byte-shuffle mask, %xmm4 = round constant (starts at 1, doubled
// each round with pslld), %xmm2 = copy of the current round key.
2019 movdqa L$key_rotate(%rip),%xmm5
2020 movl $8,%r10d
2021 movdqa L$key_rcon1(%rip),%xmm4
2022 movdqa %xmm0,%xmm2
2023 movdqu %xmm0,(%rdx)
2024 jmp L$oop_key128
2025
2026.p2align 4
2027L$oop_key128:
// pshufb %xmm5,%xmm0 broadcasts the byte-rotated last word to all lanes;
// aesenclast %xmm4,%xmm0 then applies the S-box and xors in the round
// constant.
2028.byte 102,15,56,0,197
2029.byte 102,15,56,221,196
2030 pslld $1,%xmm4
2031 leaq 16(%rax),%rax
2032
// %xmm2 = running xor of the previous key's words
// (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3).
2033 movdqa %xmm2,%xmm3
2034 pslldq $4,%xmm2
2035 pxor %xmm2,%xmm3
2036 pslldq $4,%xmm2
2037 pxor %xmm2,%xmm3
2038 pslldq $4,%xmm2
2039 pxor %xmm3,%xmm2
2040
// Combine into the next round key and store it.
2041 pxor %xmm2,%xmm0
2042 movdqu %xmm0,-16(%rax)
2043 movdqa %xmm0,%xmm2
2044
2045 decl %r10d
2046 jnz L$oop_key128
2047
// After eight doublings the constant no longer fits in a byte; the last two
// rounds use 0x1b and (after pslld) 0x36.
2048 movdqa L$key_rcon1b(%rip),%xmm4
2049
2050.byte 102,15,56,0,197
2051.byte 102,15,56,221,196
2052 pslld $1,%xmm4
2053
2054 movdqa %xmm2,%xmm3
2055 pslldq $4,%xmm2
2056 pxor %xmm2,%xmm3
2057 pslldq $4,%xmm2
2058 pxor %xmm2,%xmm3
2059 pslldq $4,%xmm2
2060 pxor %xmm3,%xmm2
2061
2062 pxor %xmm2,%xmm0
2063 movdqu %xmm0,(%rax)
2064
2065 movdqa %xmm0,%xmm2
2066.byte 102,15,56,0,197
2067.byte 102,15,56,221,196
2068
2069 movdqa %xmm2,%xmm3
2070 pslldq $4,%xmm2
2071 pxor %xmm2,%xmm3
2072 pslldq $4,%xmm2
2073 pxor %xmm2,%xmm3
2074 pslldq $4,%xmm2
2075 pxor %xmm3,%xmm2
2076
2077 pxor %xmm2,%xmm0
2078 movdqu %xmm0,16(%rax)
2079
// %rax is at schedule+144 here, so 96(%rax) is offset 240.
2080 movl %esi,96(%rax)
2081 xorl %eax,%eax
2082 jmp L$enc_key_ret
2083
// ---- 192-bit key ----
2084.p2align 4
2085L$12rounds:
// %xmm2 = key bytes 16..23 (movq loads the low 64 bits and zeroes the rest).
2086 movq 16(%rdi),%xmm2
2087 movl $11,%esi
2088 cmpl $268435456,%r10d
2089 je L$12rounds_alt
2090
// aeskeygenassist $rcon,%xmm2,%xmm1 followed by alternating 192a/192b
// helper calls, which store 16 and 32 bytes of schedule respectively.
2091 movups %xmm0,(%rdx)
2092.byte 102,15,58,223,202,1
2093 call L$key_expansion_192a_cold
2094.byte 102,15,58,223,202,2
2095 call L$key_expansion_192b
2096.byte 102,15,58,223,202,4
2097 call L$key_expansion_192a
2098.byte 102,15,58,223,202,8
2099 call L$key_expansion_192b
2100.byte 102,15,58,223,202,16
2101 call L$key_expansion_192a
2102.byte 102,15,58,223,202,32
2103 call L$key_expansion_192b
2104.byte 102,15,58,223,202,64
2105 call L$key_expansion_192a
2106.byte 102,15,58,223,202,128
2107 call L$key_expansion_192b
// %rax is at schedule+192, so 48(%rax) is offset 240.
2108 movups %xmm0,(%rax)
2109 movl %esi,48(%rax)
2110 xorq %rax,%rax
2111 jmp L$enc_key_ret
2112
2113.p2align 4
2114L$12rounds_alt:
2115 movdqa L$key_rotate192(%rip),%xmm5
2116 movdqa L$key_rcon1(%rip),%xmm4
2117 movl $8,%r10d
2118 movdqu %xmm0,(%rdx)
2119 jmp L$oop_key192
2120
2121.p2align 4
2122L$oop_key192:
// Each iteration emits 24 bytes of schedule: the low 8 bytes of %xmm2 here
// and 16 bytes of %xmm0 at the bottom.
2123 movq %xmm2,0(%rax)
2124 movdqa %xmm2,%xmm1
// pshufb %xmm5,%xmm2 then aesenclast %xmm4,%xmm2: rotate, S-box, xor rcon.
2125.byte 102,15,56,0,213
2126.byte 102,15,56,221,212
2127 pslld $1,%xmm4
2128 leaq 24(%rax),%rax
2129
// %xmm0 = running xor of its own words.
2130 movdqa %xmm0,%xmm3
2131 pslldq $4,%xmm0
2132 pxor %xmm0,%xmm3
2133 pslldq $4,%xmm0
2134 pxor %xmm0,%xmm3
2135 pslldq $4,%xmm0
2136 pxor %xmm3,%xmm0
2137
// Fold the top word of %xmm0 into the saved upper key words (%xmm1).
2138 pshufd $0xff,%xmm0,%xmm3
2139 pxor %xmm1,%xmm3
2140 pslldq $4,%xmm1
2141 pxor %xmm1,%xmm3
2142
2143 pxor %xmm2,%xmm0
2144 pxor %xmm3,%xmm2
2145 movdqu %xmm0,-16(%rax)
2146
2147 decl %r10d
2148 jnz L$oop_key192
2149
// %rax is at schedule+208, so 32(%rax) is offset 240.
2150 movl %esi,32(%rax)
2151 xorl %eax,%eax
2152 jmp L$enc_key_ret
2153
// ---- 256-bit key ----
2154.p2align 4
2155L$14rounds:
// %xmm2 = key bytes 16..31; %rax now points at the third round-key slot.
2156 movups 16(%rdi),%xmm2
2157 movl $13,%esi
2158 leaq 16(%rax),%rax
2159 cmpl $268435456,%r10d
2160 je L$14rounds_alt
2161
// The first two round keys are the user key itself. Then alternate
// aeskeygenassist on %xmm2 (256a updates %xmm0) and on %xmm0 (256b updates
// %xmm2).
2162 movups %xmm0,(%rdx)
2163 movups %xmm2,16(%rdx)
2164.byte 102,15,58,223,202,1
2165 call L$key_expansion_256a_cold
2166.byte 102,15,58,223,200,1
2167 call L$key_expansion_256b
2168.byte 102,15,58,223,202,2
2169 call L$key_expansion_256a
2170.byte 102,15,58,223,200,2
2171 call L$key_expansion_256b
2172.byte 102,15,58,223,202,4
2173 call L$key_expansion_256a
2174.byte 102,15,58,223,200,4
2175 call L$key_expansion_256b
2176.byte 102,15,58,223,202,8
2177 call L$key_expansion_256a
2178.byte 102,15,58,223,200,8
2179 call L$key_expansion_256b
2180.byte 102,15,58,223,202,16
2181 call L$key_expansion_256a
2182.byte 102,15,58,223,200,16
2183 call L$key_expansion_256b
2184.byte 102,15,58,223,202,32
2185 call L$key_expansion_256a
2186.byte 102,15,58,223,200,32
2187 call L$key_expansion_256b
2188.byte 102,15,58,223,202,64
2189 call L$key_expansion_256a
// %rax is at schedule+224, so 16(%rax) is offset 240.
2190 movups %xmm0,(%rax)
2191 movl %esi,16(%rax)
2192 xorq %rax,%rax
2193 jmp L$enc_key_ret
2194
2195.p2align 4
2196L$14rounds_alt:
2197 movdqa L$key_rotate(%rip),%xmm5
2198 movdqa L$key_rcon1(%rip),%xmm4
2199 movl $7,%r10d
2200 movdqu %xmm0,0(%rdx)
2201 movdqa %xmm2,%xmm1
2202 movdqu %xmm2,16(%rdx)
2203 jmp L$oop_key256
2204
2205.p2align 4
2206L$oop_key256:
// Even round key: pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 (rotate,
// S-box, xor rcon), combined with the running xor of %xmm0's words.
2207.byte 102,15,56,0,213
2208.byte 102,15,56,221,212
2209
2210 movdqa %xmm0,%xmm3
2211 pslldq $4,%xmm0
2212 pxor %xmm0,%xmm3
2213 pslldq $4,%xmm0
2214 pxor %xmm0,%xmm3
2215 pslldq $4,%xmm0
2216 pxor %xmm3,%xmm0
2217 pslld $1,%xmm4
2218
2219 pxor %xmm2,%xmm0
2220 movdqu %xmm0,(%rax)
2221
// The seventh pass produces the final round key; no odd key follows it.
2222 decl %r10d
2223 jz L$done_key256
2224
// Odd round key: broadcast the top word of the new key and run
// aesenclast %xmm3,%xmm2 with a zero key (S-box only, no rotate or rcon),
// then combine with the running xor of %xmm1's words.
2225 pshufd $0xff,%xmm0,%xmm2
2226 pxor %xmm3,%xmm3
2227.byte 102,15,56,221,211
2228
2229 movdqa %xmm1,%xmm3
2230 pslldq $4,%xmm1
2231 pxor %xmm1,%xmm3
2232 pslldq $4,%xmm1
2233 pxor %xmm1,%xmm3
2234 pslldq $4,%xmm1
2235 pxor %xmm3,%xmm1
2236
2237 pxor %xmm1,%xmm2
2238 movdqu %xmm2,16(%rax)
2239 leaq 32(%rax),%rax
2240 movdqa %xmm2,%xmm1
2241
2242 jmp L$oop_key256
2243
2244L$done_key256:
// %rax is at schedule+224, so 16(%rax) is offset 240.
2245 movl %esi,16(%rax)
2246 xorl %eax,%eax
2247 jmp L$enc_key_ret
2248
2249.p2align 4
2250L$bad_keybits:
// Unsupported key size.
2251 movq $-2,%rax
2252L$enc_key_ret:
// Wipe every XMM register that held key material before returning.
2253 pxor %xmm0,%xmm0
2254 pxor %xmm1,%xmm1
2255 pxor %xmm2,%xmm2
2256 pxor %xmm3,%xmm3
2257 pxor %xmm4,%xmm4
2258 pxor %xmm5,%xmm5
2259 addq $8,%rsp
2260
2261 ret
2262
David Benjaminbfcab2a2024-05-12 10:26:48 -04002263
David Benjaminfe0c91e2024-03-18 15:37:24 +10002264
// L$key_expansion_128: one step of the 128-bit key schedule
// (aeskeygenassist variant).
// In:  %xmm0 = previous round key, %xmm1 = aeskeygenassist result,
//      %xmm4 = scratch whose low dword is zero (cleared by the caller at
//      function entry and left zero by the shuffles here), %rax = store pointer.
// Out: %xmm0 = next round key; %rax advanced by 16 (not on the _cold entry).
2265.p2align 4
2266L$key_expansion_128:
// Store the previous round key; the _cold entry skips this because the
// caller has already written round key 0.
2267 movups %xmm0,(%rax)
2268 leaq 16(%rax),%rax
2269L$key_expansion_128_cold:
// Two shufps/xorps pairs turn %xmm0 into the running xor of its words.
2270 shufps $16,%xmm0,%xmm4
2271 xorps %xmm4,%xmm0
2272 shufps $140,%xmm0,%xmm4
2273 xorps %xmm4,%xmm0
// Broadcast dword 3 of the aeskeygenassist result and xor it in.
2274 shufps $255,%xmm1,%xmm1
2275 xorps %xmm1,%xmm0
2276 ret
2277
// L$key_expansion_192a: one step of the 192-bit key schedule
// (aeskeygenassist variant).
// In:  %xmm0 = first four words of the current key state, %xmm2 = remaining
//      two words (low 64 bits), %xmm1 = aeskeygenassist result,
//      %xmm4 = scratch with zero low dword, %rax = store pointer.
// Out: %xmm0/%xmm2 updated; %xmm5 = copy of the incoming %xmm2 (consumed by
//      L$key_expansion_192b); %xmm3 clobbered.
// Entry points: 192a stores %xmm0 and advances %rax by 16; 192a_cold skips
// the store; 192b_warm (jumped to from L$key_expansion_192b) also skips the
// %xmm5 copy.
2278.p2align 4
2279L$key_expansion_192a:
2280 movups %xmm0,(%rax)
2281 leaq 16(%rax),%rax
2282L$key_expansion_192a_cold:
2283 movaps %xmm2,%xmm5
2284L$key_expansion_192b_warm:
// Running xor over %xmm0's words (shufps/xorps with %xmm4), interleaved with
// %xmm2 ^= %xmm2 << 32.
2285 shufps $16,%xmm0,%xmm4
2286 movdqa %xmm2,%xmm3
2287 xorps %xmm4,%xmm0
2288 shufps $140,%xmm0,%xmm4
2289 pslldq $4,%xmm3
2290 xorps %xmm4,%xmm0
// Broadcast dword 1 of the aeskeygenassist result and xor it into %xmm0.
2291 pshufd $85,%xmm1,%xmm1
2292 pxor %xmm3,%xmm2
2293 pxor %xmm1,%xmm0
// Fold the new top word of %xmm0 into %xmm2.
2294 pshufd $255,%xmm0,%xmm3
2295 pxor %xmm3,%xmm2
2296 ret
2297
// L$key_expansion_192b: companion step of the 192-bit key schedule.
// Stores 32 bytes of schedule assembled from %xmm5 (saved by
// L$key_expansion_192a), %xmm0 and %xmm2, advances %rax by 32, then
// tail-jumps into L$key_expansion_192b_warm to derive the next key state.
2298.p2align 4
2299L$key_expansion_192b:
2300 movaps %xmm0,%xmm3
// %xmm5 = { %xmm5 dwords 0-1, %xmm0 dwords 0-1 }.
2301 shufps $68,%xmm0,%xmm5
2302 movups %xmm5,(%rax)
// %xmm3 = { %xmm0 dwords 2-3, %xmm2 dwords 0-1 }.
2303 shufps $78,%xmm2,%xmm3
2304 movups %xmm3,16(%rax)
2305 leaq 32(%rax),%rax
2306 jmp L$key_expansion_192b_warm
2307
// L$key_expansion_256a: even step of the 256-bit key schedule
// (aeskeygenassist variant).
// In:  %xmm0 = round key to update, %xmm2 = most recent odd round key,
//      %xmm1 = aeskeygenassist result, %xmm4 = scratch with zero low dword,
//      %rax = store pointer.
// Out: %xmm0 = next even round key; %rax advanced by 16 (not on _cold entry).
2308.p2align 4
2309L$key_expansion_256a:
// Store the previous odd round key; the _cold entry skips this because the
// caller has already written the first two round keys.
2310 movups %xmm2,(%rax)
2311 leaq 16(%rax),%rax
2312L$key_expansion_256a_cold:
// Running xor over %xmm0's words.
2313 shufps $16,%xmm0,%xmm4
2314 xorps %xmm4,%xmm0
2315 shufps $140,%xmm0,%xmm4
2316 xorps %xmm4,%xmm0
// Broadcast dword 3 of the aeskeygenassist result and xor it in.
2317 shufps $255,%xmm1,%xmm1
2318 xorps %xmm1,%xmm0
2319 ret
2320
// L$key_expansion_256b: odd step of the 256-bit key schedule
// (aeskeygenassist variant).
// In:  %xmm0 = most recent even round key, %xmm2 = round key to update,
//      %xmm1 = aeskeygenassist result, %xmm4 = scratch with zero low dword,
//      %rax = store pointer.
// Out: %xmm2 = next odd round key; %rax advanced by 16.
2321.p2align 4
2322L$key_expansion_256b:
// Store the even round key produced by the preceding 256a step.
2323 movups %xmm0,(%rax)
2324 leaq 16(%rax),%rax
2325
// Running xor over %xmm2's words.
2326 shufps $16,%xmm2,%xmm4
2327 xorps %xmm4,%xmm2
2328 shufps $140,%xmm2,%xmm4
2329 xorps %xmm4,%xmm2
// Broadcast dword 2 of the aeskeygenassist result and xor it in.
2330 shufps $170,%xmm1,%xmm1
2331 xorps %xmm1,%xmm2
2332 ret
2333
2334
// Read-only constants. The section starts 64-byte aligned and every table is
// 16 bytes long, so each label is 16-byte aligned (the key-setup code loads
// L$key_rotate, L$key_rotate192, L$key_rcon1 and L$key_rcon1b with movdqa).
2335.section __DATA,__const
2336.p2align 6
// Byte indices 15..0. Not referenced by the CBC or key-setup code in this
// part of the file; presumably a pshufb byte-reversal mask - confirm at its
// use site.
2337L$bswap_mask:
2338.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
// L$increment32, L$increment64, L$xts_magic and L$increment1 are likewise
// not referenced by the CBC or key-setup code in this part of the file.
2339L$increment32:
2340.long 6,6,6,0
2341L$increment64:
2342.long 1,0,0,0
2343L$xts_magic:
2344.long 0x87,0,1,0
2345L$increment1:
2346.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
// pshufb mask selecting source bytes 13,14,15,12 into every dword; used by
// L$10rounds_alt and L$14rounds_alt.
2347L$key_rotate:
2348.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
// pshufb mask selecting source bytes 5,6,7,4 into every dword; used by
// L$12rounds_alt.
2349L$key_rotate192:
2350.long 0x04070605,0x04070605,0x04070605,0x04070605
// Initial round constant (1) in every dword; doubled per round with pslld.
2351L$key_rcon1:
2352.long 1,1,1,1
// Round constant 0x1b in every dword, loaded by the 128-bit alt path for its
// last two rounds.
2353L$key_rcon1b:
2354.long 0x1b,0x1b,0x1b,0x1b
2355
// NUL-terminated ASCII: "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>".
2356.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
2357.p2align 6
// Switch back to the code section.
2358.text
2359#endif