Stop manually encoding various x86 extensions from perlasm

Confirmed with objdump -d that object files remain unchanged.

Bug: 478924351
Change-Id: I590f2a31ef32a79a5a06414a3a0675ecdb4cc9b3
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/87907
Reviewed-by: Lily Chen <chlily@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Presubmit-BoringSSL-Verified: boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com <boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com>
Commit-Queue: Lily Chen <chlily@google.com>
diff --git a/crypto/perlasm/x86asm.pl b/crypto/perlasm/x86asm.pl index acb1e6f..80a097d 100644 --- a/crypto/perlasm/x86asm.pl +++ b/crypto/perlasm/x86asm.pl
@@ -114,65 +114,6 @@ { &::generic("movq",@_); } } -# SSE>2 instructions -my %regrm = ( "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3, - "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7 ); -sub ::pextrd -{ my($dst,$src,$imm)=@_; - if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/) - { &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); } - else - { &::generic("pextrd",@_); } -} - -sub ::pinsrd -{ my($dst,$src,$imm)=@_; - if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/) - { &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); } - else - { &::generic("pinsrd",@_); } -} - -sub ::pshufb -{ my($dst,$src)=@_; - if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) - { &data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2); } - else - { &::generic("pshufb",@_); } -} - -sub ::palignr -{ my($dst,$src,$imm)=@_; - if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) - { &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); } - else - { &::generic("palignr",@_); } -} - -sub ::pclmulqdq -{ my($dst,$src,$imm)=@_; - if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) - { &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); } - else - { &::generic("pclmulqdq",@_); } -} - -sub ::rdrand -{ my ($dst)=@_; - if ($dst =~ /(e[a-dsd][ixp])/) - { &::data_byte(0x0f,0xc7,0xf0|$regrm{$dst}); } - else - { &::generic("rdrand",@_); } -} - -sub ::rdseed -{ my ($dst)=@_; - if ($dst =~ /(e[a-dsd][ixp])/) - { &::data_byte(0x0f,0xc7,0xf8|$regrm{$dst}); } - else - { &::generic("rdrand",@_); } -} - sub rxb { local *opcode=shift; my ($dst,$src1,$src2,$rxb)=@_;
diff --git a/gen/bcm/aesni-x86-apple.S b/gen/bcm/aesni-x86-apple.S index db13057..7454d23 100644 --- a/gen/bcm/aesni-x86-apple.S +++ b/gen/bcm/aesni-x86-apple.S
@@ -638,7 +638,7 @@ movdqa %xmm7,%xmm2 leal 32(%edx,%ecx,1),%edx subl %ecx,%ebx -.byte 102,15,56,0,253 + pshufb %xmm5,%xmm7 L031ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx @@ -667,7 +667,7 @@ xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) -.byte 102,15,56,0,213 + pshufb %xmm5,%xmm2 leal 16(%edi),%edi jnz L031ccm64_enc_outer movl 48(%esp),%esp @@ -722,7 +722,7 @@ movdqa %xmm7,%xmm2 movl %edx,%ebp movl %ecx,%ebx -.byte 102,15,56,0,253 + pshufb %xmm5,%xmm7 movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx @@ -749,7 +749,7 @@ movdqa %xmm7,%xmm2 movups %xmm6,(%edi) leal 16(%edi),%edi -.byte 102,15,56,0,213 + pshufb %xmm5,%xmm2 subl $1,%eax jz L035ccm64_dec_break movups (%ebp),%xmm0 @@ -851,29 +851,29 @@ movl %ecx,20(%esp) movl %ecx,24(%esp) movl %ebp,28(%esp) -.byte 102,15,58,22,251,3 -.byte 102,15,58,34,253,3 + pextrd $3,%xmm7,%ebx + pinsrd $3,%ebp,%xmm7 movl 240(%edx),%ecx bswap %ebx pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 movdqa (%esp),%xmm2 -.byte 102,15,58,34,195,0 + pinsrd $0,%ebx,%xmm0 leal 3(%ebx),%ebp -.byte 102,15,58,34,205,0 + pinsrd $0,%ebp,%xmm1 incl %ebx -.byte 102,15,58,34,195,1 + pinsrd $1,%ebx,%xmm0 incl %ebp -.byte 102,15,58,34,205,1 + pinsrd $1,%ebp,%xmm1 incl %ebx -.byte 102,15,58,34,195,2 + pinsrd $2,%ebx,%xmm0 incl %ebp -.byte 102,15,58,34,205,2 + pinsrd $2,%ebp,%xmm1 movdqa %xmm0,48(%esp) -.byte 102,15,56,0,194 + pshufb %xmm2,%xmm0 movdqu (%edx),%xmm6 movdqa %xmm1,64(%esp) -.byte 102,15,56,0,202 + pshufb %xmm2,%xmm1 pshufd $192,%xmm0,%xmm2 pshufd $128,%xmm0,%xmm3 cmpl $6,%eax @@ -930,12 +930,12 @@ movups 80(%esi),%xmm3 leal 96(%esi),%esi movdqa %xmm0,48(%esp) -.byte 102,15,56,0,194 + pshufb %xmm2,%xmm0 xorps %xmm4,%xmm6 movups %xmm5,48(%edi) xorps %xmm3,%xmm7 movdqa %xmm1,64(%esp) -.byte 102,15,56,0,202 + pshufb %xmm2,%xmm1 movups %xmm6,64(%edi) pshufd $192,%xmm0,%xmm2 movups %xmm7,80(%edi) @@ -2309,7 +2309,7 @@ movdqa %xmm0,%xmm2 movdqu %xmm0,-16(%edx) L115loop_key128: -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 aesenclast 
%xmm4,%xmm0 pslld $1,%xmm4 leal 16(%edx),%edx @@ -2326,7 +2326,7 @@ decl %ecx jnz L115loop_key128 movdqa 48(%ebx),%xmm4 -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 aesenclast %xmm4,%xmm0 pslld $1,%xmm4 movdqa %xmm2,%xmm3 @@ -2339,7 +2339,7 @@ pxor %xmm2,%xmm0 movdqu %xmm0,(%edx) movdqa %xmm0,%xmm2 -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 aesenclast %xmm4,%xmm0 movdqa %xmm2,%xmm3 pslldq $4,%xmm2 @@ -2363,7 +2363,7 @@ L117loop_key192: movq %xmm2,(%edx) movdqa %xmm2,%xmm1 -.byte 102,15,56,0,213 + pshufb %xmm5,%xmm2 aesenclast %xmm4,%xmm2 pslld $1,%xmm4 leal 24(%edx),%edx @@ -2397,7 +2397,7 @@ movdqa %xmm2,%xmm1 movdqu %xmm2,-16(%edx) L118loop_key256: -.byte 102,15,56,0,213 + pshufb %xmm5,%xmm2 aesenclast %xmm4,%xmm2 movdqa %xmm0,%xmm3 pslldq $4,%xmm0
diff --git a/gen/bcm/aesni-x86-linux.S b/gen/bcm/aesni-x86-linux.S index 6a7d93c..b6772c1 100644 --- a/gen/bcm/aesni-x86-linux.S +++ b/gen/bcm/aesni-x86-linux.S
@@ -661,7 +661,7 @@ movdqa %xmm7,%xmm2 leal 32(%edx,%ecx,1),%edx subl %ecx,%ebx -.byte 102,15,56,0,253 + pshufb %xmm5,%xmm7 .L031ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx @@ -690,7 +690,7 @@ xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) -.byte 102,15,56,0,213 + pshufb %xmm5,%xmm2 leal 16(%edi),%edi jnz .L031ccm64_enc_outer movl 48(%esp),%esp @@ -747,7 +747,7 @@ movdqa %xmm7,%xmm2 movl %edx,%ebp movl %ecx,%ebx -.byte 102,15,56,0,253 + pshufb %xmm5,%xmm7 movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx @@ -774,7 +774,7 @@ movdqa %xmm7,%xmm2 movups %xmm6,(%edi) leal 16(%edi),%edi -.byte 102,15,56,0,213 + pshufb %xmm5,%xmm2 subl $1,%eax jz .L035ccm64_dec_break movups (%ebp),%xmm0 @@ -878,29 +878,29 @@ movl %ecx,20(%esp) movl %ecx,24(%esp) movl %ebp,28(%esp) -.byte 102,15,58,22,251,3 -.byte 102,15,58,34,253,3 + pextrd $3,%xmm7,%ebx + pinsrd $3,%ebp,%xmm7 movl 240(%edx),%ecx bswap %ebx pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 movdqa (%esp),%xmm2 -.byte 102,15,58,34,195,0 + pinsrd $0,%ebx,%xmm0 leal 3(%ebx),%ebp -.byte 102,15,58,34,205,0 + pinsrd $0,%ebp,%xmm1 incl %ebx -.byte 102,15,58,34,195,1 + pinsrd $1,%ebx,%xmm0 incl %ebp -.byte 102,15,58,34,205,1 + pinsrd $1,%ebp,%xmm1 incl %ebx -.byte 102,15,58,34,195,2 + pinsrd $2,%ebx,%xmm0 incl %ebp -.byte 102,15,58,34,205,2 + pinsrd $2,%ebp,%xmm1 movdqa %xmm0,48(%esp) -.byte 102,15,56,0,194 + pshufb %xmm2,%xmm0 movdqu (%edx),%xmm6 movdqa %xmm1,64(%esp) -.byte 102,15,56,0,202 + pshufb %xmm2,%xmm1 pshufd $192,%xmm0,%xmm2 pshufd $128,%xmm0,%xmm3 cmpl $6,%eax @@ -957,12 +957,12 @@ movups 80(%esi),%xmm3 leal 96(%esi),%esi movdqa %xmm0,48(%esp) -.byte 102,15,56,0,194 + pshufb %xmm2,%xmm0 xorps %xmm4,%xmm6 movups %xmm5,48(%edi) xorps %xmm3,%xmm7 movdqa %xmm1,64(%esp) -.byte 102,15,56,0,202 + pshufb %xmm2,%xmm1 movups %xmm6,64(%edi) pshufd $192,%xmm0,%xmm2 movups %xmm7,80(%edi) @@ -2346,7 +2346,7 @@ movdqa %xmm0,%xmm2 movdqu %xmm0,-16(%edx) .L115loop_key128: -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 
aesenclast %xmm4,%xmm0 pslld $1,%xmm4 leal 16(%edx),%edx @@ -2363,7 +2363,7 @@ decl %ecx jnz .L115loop_key128 movdqa 48(%ebx),%xmm4 -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 aesenclast %xmm4,%xmm0 pslld $1,%xmm4 movdqa %xmm2,%xmm3 @@ -2376,7 +2376,7 @@ pxor %xmm2,%xmm0 movdqu %xmm0,(%edx) movdqa %xmm0,%xmm2 -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 aesenclast %xmm4,%xmm0 movdqa %xmm2,%xmm3 pslldq $4,%xmm2 @@ -2400,7 +2400,7 @@ .L117loop_key192: movq %xmm2,(%edx) movdqa %xmm2,%xmm1 -.byte 102,15,56,0,213 + pshufb %xmm5,%xmm2 aesenclast %xmm4,%xmm2 pslld $1,%xmm4 leal 24(%edx),%edx @@ -2434,7 +2434,7 @@ movdqa %xmm2,%xmm1 movdqu %xmm2,-16(%edx) .L118loop_key256: -.byte 102,15,56,0,213 + pshufb %xmm5,%xmm2 aesenclast %xmm4,%xmm2 movdqa %xmm0,%xmm3 pslldq $4,%xmm0
diff --git a/gen/bcm/aesni-x86-win.asm b/gen/bcm/aesni-x86-win.asm index 4bb5431..660b772 100644 --- a/gen/bcm/aesni-x86-win.asm +++ b/gen/bcm/aesni-x86-win.asm
@@ -635,7 +635,7 @@ movdqa xmm2,xmm7 lea edx,[32+ecx*1+edx] sub ebx,ecx -db 102,15,56,0,253 + pshufb xmm7,xmm5 L$031ccm64_enc_outer: movups xmm0,[ebp] mov ecx,ebx @@ -664,7 +664,7 @@ xorps xmm6,xmm2 movdqa xmm2,xmm7 movups [edi],xmm6 -db 102,15,56,0,213 + pshufb xmm2,xmm5 lea edi,[16+edi] jnz NEAR L$031ccm64_enc_outer mov esp,DWORD [48+esp] @@ -718,7 +718,7 @@ movdqa xmm2,xmm7 mov ebp,edx mov ebx,ecx -db 102,15,56,0,253 + pshufb xmm7,xmm5 movups xmm0,[edx] movups xmm1,[16+edx] lea edx,[32+edx] @@ -745,7 +745,7 @@ movdqa xmm2,xmm7 movups [edi],xmm6 lea edi,[16+edi] -db 102,15,56,0,213 + pshufb xmm2,xmm5 sub eax,1 jz NEAR L$035ccm64_dec_break movups xmm0,[ebp] @@ -846,29 +846,29 @@ mov DWORD [20+esp],ecx mov DWORD [24+esp],ecx mov DWORD [28+esp],ebp -db 102,15,58,22,251,3 -db 102,15,58,34,253,3 + pextrd ebx,xmm7,3 + pinsrd xmm7,ebp,3 mov ecx,DWORD [240+edx] bswap ebx pxor xmm0,xmm0 pxor xmm1,xmm1 movdqa xmm2,[esp] -db 102,15,58,34,195,0 + pinsrd xmm0,ebx,0 lea ebp,[3+ebx] -db 102,15,58,34,205,0 + pinsrd xmm1,ebp,0 inc ebx -db 102,15,58,34,195,1 + pinsrd xmm0,ebx,1 inc ebp -db 102,15,58,34,205,1 + pinsrd xmm1,ebp,1 inc ebx -db 102,15,58,34,195,2 + pinsrd xmm0,ebx,2 inc ebp -db 102,15,58,34,205,2 + pinsrd xmm1,ebp,2 movdqa [48+esp],xmm0 -db 102,15,56,0,194 + pshufb xmm0,xmm2 movdqu xmm6,[edx] movdqa [64+esp],xmm1 -db 102,15,56,0,202 + pshufb xmm1,xmm2 pshufd xmm2,xmm0,192 pshufd xmm3,xmm0,128 cmp eax,6 @@ -925,12 +925,12 @@ movups xmm3,[80+esi] lea esi,[96+esi] movdqa [48+esp],xmm0 -db 102,15,56,0,194 + pshufb xmm0,xmm2 xorps xmm6,xmm4 movups [48+edi],xmm5 xorps xmm7,xmm3 movdqa [64+esp],xmm1 -db 102,15,56,0,202 + pshufb xmm1,xmm2 movups [64+edi],xmm6 pshufd xmm2,xmm0,192 movups [80+edi],xmm7 @@ -2299,7 +2299,7 @@ movdqa xmm2,xmm0 movdqu [edx-16],xmm0 L$115loop_key128: -db 102,15,56,0,197 + pshufb xmm0,xmm5 aesenclast xmm0,xmm4 pslld xmm4,1 lea edx,[16+edx] @@ -2316,7 +2316,7 @@ dec ecx jnz NEAR L$115loop_key128 movdqa xmm4,[48+ebx] -db 102,15,56,0,197 + pshufb 
xmm0,xmm5 aesenclast xmm0,xmm4 pslld xmm4,1 movdqa xmm3,xmm2 @@ -2329,7 +2329,7 @@ pxor xmm0,xmm2 movdqu [edx],xmm0 movdqa xmm2,xmm0 -db 102,15,56,0,197 + pshufb xmm0,xmm5 aesenclast xmm0,xmm4 movdqa xmm3,xmm2 pslldq xmm2,4 @@ -2353,7 +2353,7 @@ L$117loop_key192: movq [edx],xmm2 movdqa xmm1,xmm2 -db 102,15,56,0,213 + pshufb xmm2,xmm5 aesenclast xmm2,xmm4 pslld xmm4,1 lea edx,[24+edx] @@ -2387,7 +2387,7 @@ movdqa xmm1,xmm2 movdqu [edx-16],xmm2 L$118loop_key256: -db 102,15,56,0,213 + pshufb xmm2,xmm5 aesenclast xmm2,xmm4 movdqa xmm3,xmm0 pslldq xmm0,4
diff --git a/gen/bcm/ghash-ssse3-x86-apple.S b/gen/bcm/ghash-ssse3-x86-apple.S index 96cb86f..b58a4fa 100644 --- a/gen/bcm/ghash-ssse3-x86-apple.S +++ b/gen/bcm/ghash-ssse3-x86-apple.S
@@ -22,7 +22,7 @@ popl %eax movdqa Lreverse_bytes-L000pic_point(%eax),%xmm7 movdqa Llow4_mask-L000pic_point(%eax),%xmm2 -.byte 102,15,56,0,199 + pshufb %xmm7,%xmm0 movdqa %xmm2,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 @@ -34,12 +34,12 @@ movdqu (%esi),%xmm4 leal 16(%esi),%esi movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 + palignr $1,%xmm3,%xmm6 movdqa %xmm6,%xmm3 psrldq $1,%xmm2 movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 pxor %xmm5,%xmm2 movdqa %xmm4,%xmm5 psllq $60,%xmm5 @@ -65,12 +65,12 @@ movdqu (%esi),%xmm4 leal 16(%esi),%esi movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 + palignr $1,%xmm3,%xmm6 movdqa %xmm6,%xmm3 psrldq $1,%xmm2 movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 pxor %xmm5,%xmm2 movdqa %xmm4,%xmm5 psllq $60,%xmm5 @@ -96,12 +96,12 @@ movdqu (%esi),%xmm4 leal 16(%esi),%esi movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 + palignr $1,%xmm3,%xmm6 movdqa %xmm6,%xmm3 psrldq $1,%xmm2 movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 pxor %xmm5,%xmm2 movdqa %xmm4,%xmm5 psllq $60,%xmm5 @@ -122,7 +122,7 @@ psrlq $5,%xmm3 pxor %xmm3,%xmm2 pxor %xmm3,%xmm3 -.byte 102,15,56,0,215 + pshufb %xmm7,%xmm2 movdqu %xmm2,(%edi) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -155,12 +155,12 @@ popl %ebx movdqa Lreverse_bytes-L004pic_point(%ebx),%xmm7 andl $-16,%ecx -.byte 102,15,56,0,199 + pshufb %xmm7,%xmm0 pxor %xmm3,%xmm3 L005loop_ghash: movdqa Llow4_mask-L004pic_point(%ebx),%xmm2 movdqu (%edx),%xmm1 -.byte 102,15,56,0,207 + pshufb %xmm7,%xmm1 pxor %xmm1,%xmm0 movdqa %xmm2,%xmm1 pandn %xmm0,%xmm1 @@ -172,12 +172,12 @@ movdqu (%esi),%xmm4 leal 16(%esi),%esi movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 + palignr $1,%xmm3,%xmm6 movdqa %xmm6,%xmm3 psrldq $1,%xmm2 movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 pxor %xmm5,%xmm2 movdqa %xmm4,%xmm5 
psllq $60,%xmm5 @@ -203,12 +203,12 @@ movdqu (%esi),%xmm4 leal 16(%esi),%esi movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 + palignr $1,%xmm3,%xmm6 movdqa %xmm6,%xmm3 psrldq $1,%xmm2 movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 pxor %xmm5,%xmm2 movdqa %xmm4,%xmm5 psllq $60,%xmm5 @@ -234,12 +234,12 @@ movdqu (%esi),%xmm4 leal 16(%esi),%esi movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 + palignr $1,%xmm3,%xmm6 movdqa %xmm6,%xmm3 psrldq $1,%xmm2 movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 pxor %xmm5,%xmm2 movdqa %xmm4,%xmm5 psllq $60,%xmm5 @@ -265,7 +265,7 @@ leal 16(%edx),%edx subl $16,%ecx jnz L005loop_ghash -.byte 102,15,56,0,199 + pshufb %xmm7,%xmm0 movdqu %xmm0,(%edi) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1
diff --git a/gen/bcm/ghash-ssse3-x86-linux.S b/gen/bcm/ghash-ssse3-x86-linux.S index 7fe65e7..7e8d7bb 100644 --- a/gen/bcm/ghash-ssse3-x86-linux.S +++ b/gen/bcm/ghash-ssse3-x86-linux.S
@@ -23,7 +23,7 @@ popl %eax movdqa .Lreverse_bytes-.L000pic_point(%eax),%xmm7 movdqa .Llow4_mask-.L000pic_point(%eax),%xmm2 -.byte 102,15,56,0,199 + pshufb %xmm7,%xmm0 movdqa %xmm2,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 @@ -35,12 +35,12 @@ movdqu (%esi),%xmm4 leal 16(%esi),%esi movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 + palignr $1,%xmm3,%xmm6 movdqa %xmm6,%xmm3 psrldq $1,%xmm2 movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 pxor %xmm5,%xmm2 movdqa %xmm4,%xmm5 psllq $60,%xmm5 @@ -66,12 +66,12 @@ movdqu (%esi),%xmm4 leal 16(%esi),%esi movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 + palignr $1,%xmm3,%xmm6 movdqa %xmm6,%xmm3 psrldq $1,%xmm2 movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 pxor %xmm5,%xmm2 movdqa %xmm4,%xmm5 psllq $60,%xmm5 @@ -97,12 +97,12 @@ movdqu (%esi),%xmm4 leal 16(%esi),%esi movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 + palignr $1,%xmm3,%xmm6 movdqa %xmm6,%xmm3 psrldq $1,%xmm2 movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 pxor %xmm5,%xmm2 movdqa %xmm4,%xmm5 psllq $60,%xmm5 @@ -123,7 +123,7 @@ psrlq $5,%xmm3 pxor %xmm3,%xmm2 pxor %xmm3,%xmm3 -.byte 102,15,56,0,215 + pshufb %xmm7,%xmm2 movdqu %xmm2,(%edi) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -158,12 +158,12 @@ popl %ebx movdqa .Lreverse_bytes-.L004pic_point(%ebx),%xmm7 andl $-16,%ecx -.byte 102,15,56,0,199 + pshufb %xmm7,%xmm0 pxor %xmm3,%xmm3 .L005loop_ghash: movdqa .Llow4_mask-.L004pic_point(%ebx),%xmm2 movdqu (%edx),%xmm1 -.byte 102,15,56,0,207 + pshufb %xmm7,%xmm1 pxor %xmm1,%xmm0 movdqa %xmm2,%xmm1 pandn %xmm0,%xmm1 @@ -175,12 +175,12 @@ movdqu (%esi),%xmm4 leal 16(%esi),%esi movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 + palignr $1,%xmm3,%xmm6 movdqa %xmm6,%xmm3 psrldq $1,%xmm2 movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 pxor %xmm5,%xmm2 movdqa 
%xmm4,%xmm5 psllq $60,%xmm5 @@ -206,12 +206,12 @@ movdqu (%esi),%xmm4 leal 16(%esi),%esi movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 + palignr $1,%xmm3,%xmm6 movdqa %xmm6,%xmm3 psrldq $1,%xmm2 movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 pxor %xmm5,%xmm2 movdqa %xmm4,%xmm5 psllq $60,%xmm5 @@ -237,12 +237,12 @@ movdqu (%esi),%xmm4 leal 16(%esi),%esi movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 + palignr $1,%xmm3,%xmm6 movdqa %xmm6,%xmm3 psrldq $1,%xmm2 movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 pxor %xmm5,%xmm2 movdqa %xmm4,%xmm5 psllq $60,%xmm5 @@ -268,7 +268,7 @@ leal 16(%edx),%edx subl $16,%ecx jnz .L005loop_ghash -.byte 102,15,56,0,199 + pshufb %xmm7,%xmm0 movdqu %xmm0,(%edi) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1
diff --git a/gen/bcm/ghash-ssse3-x86-win.asm b/gen/bcm/ghash-ssse3-x86-win.asm index 1eca092..387b4a1 100644 --- a/gen/bcm/ghash-ssse3-x86-win.asm +++ b/gen/bcm/ghash-ssse3-x86-win.asm
@@ -29,7 +29,7 @@ pop eax movdqa xmm7,[(L$reverse_bytes-L$000pic_point)+eax] movdqa xmm2,[(L$low4_mask-L$000pic_point)+eax] -db 102,15,56,0,199 + pshufb xmm0,xmm7 movdqa xmm1,xmm2 pandn xmm1,xmm0 psrld xmm1,4 @@ -41,12 +41,12 @@ movdqu xmm4,[esi] lea esi,[16+esi] movdqa xmm6,xmm2 -db 102,15,58,15,243,1 + palignr xmm6,xmm3,1 movdqa xmm3,xmm6 psrldq xmm2,1 movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 pxor xmm2,xmm5 movdqa xmm5,xmm4 psllq xmm5,60 @@ -72,12 +72,12 @@ movdqu xmm4,[esi] lea esi,[16+esi] movdqa xmm6,xmm2 -db 102,15,58,15,243,1 + palignr xmm6,xmm3,1 movdqa xmm3,xmm6 psrldq xmm2,1 movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 pxor xmm2,xmm5 movdqa xmm5,xmm4 psllq xmm5,60 @@ -103,12 +103,12 @@ movdqu xmm4,[esi] lea esi,[16+esi] movdqa xmm6,xmm2 -db 102,15,58,15,243,1 + palignr xmm6,xmm3,1 movdqa xmm3,xmm6 psrldq xmm2,1 movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 pxor xmm2,xmm5 movdqa xmm5,xmm4 psllq xmm5,60 @@ -129,7 +129,7 @@ psrlq xmm3,5 pxor xmm2,xmm3 pxor xmm3,xmm3 -db 102,15,56,0,215 + pshufb xmm2,xmm7 movdqu [edi],xmm2 pxor xmm0,xmm0 pxor xmm1,xmm1 @@ -161,12 +161,12 @@ pop ebx movdqa xmm7,[(L$reverse_bytes-L$004pic_point)+ebx] and ecx,-16 -db 102,15,56,0,199 + pshufb xmm0,xmm7 pxor xmm3,xmm3 L$005loop_ghash: movdqa xmm2,[(L$low4_mask-L$004pic_point)+ebx] movdqu xmm1,[edx] -db 102,15,56,0,207 + pshufb xmm1,xmm7 pxor xmm0,xmm1 movdqa xmm1,xmm2 pandn xmm1,xmm0 @@ -178,12 +178,12 @@ movdqu xmm4,[esi] lea esi,[16+esi] movdqa xmm6,xmm2 -db 102,15,58,15,243,1 + palignr xmm6,xmm3,1 movdqa xmm3,xmm6 psrldq xmm2,1 movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 pxor xmm2,xmm5 movdqa xmm5,xmm4 psllq xmm5,60 @@ -209,12 +209,12 @@ movdqu xmm4,[esi] lea esi,[16+esi] movdqa xmm6,xmm2 -db 102,15,58,15,243,1 + palignr xmm6,xmm3,1 movdqa xmm3,xmm6 psrldq xmm2,1 movdqa 
xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 pxor xmm2,xmm5 movdqa xmm5,xmm4 psllq xmm5,60 @@ -240,12 +240,12 @@ movdqu xmm4,[esi] lea esi,[16+esi] movdqa xmm6,xmm2 -db 102,15,58,15,243,1 + palignr xmm6,xmm3,1 movdqa xmm3,xmm6 psrldq xmm2,1 movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 pxor xmm2,xmm5 movdqa xmm5,xmm4 psllq xmm5,60 @@ -271,7 +271,7 @@ lea edx,[16+edx] sub ecx,16 jnz NEAR L$005loop_ghash -db 102,15,56,0,199 + pshufb xmm0,xmm7 movdqu [edi],xmm0 pxor xmm0,xmm0 pxor xmm1,xmm1
diff --git a/gen/bcm/ghash-x86-apple.S b/gen/bcm/ghash-x86-apple.S index a178b74..6221487 100644 --- a/gen/bcm/ghash-x86-apple.S +++ b/gen/bcm/ghash-x86-apple.S
@@ -34,9 +34,9 @@ pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 pxor %xmm2,%xmm4 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $0,%xmm4,%xmm3 xorps %xmm0,%xmm3 xorps %xmm1,%xmm3 movdqa %xmm3,%xmm4 @@ -70,7 +70,7 @@ movdqu %xmm2,(%edx) pxor %xmm0,%xmm4 movdqu %xmm0,16(%edx) -.byte 102,15,58,15,227,8 + palignr $8,%xmm3,%xmm4 movdqu %xmm4,32(%edx) ret .globl _gcm_gmult_clmul @@ -87,14 +87,14 @@ movdqu (%eax),%xmm0 movdqa (%ecx),%xmm5 movups (%edx),%xmm2 -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 movups 32(%edx),%xmm4 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $0,%xmm4,%xmm3 xorps %xmm0,%xmm3 xorps %xmm1,%xmm3 movdqa %xmm3,%xmm4 @@ -122,7 +122,7 @@ pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 movdqu %xmm0,(%eax) ret .globl _gcm_ghash_clmul @@ -145,22 +145,22 @@ movdqu (%eax),%xmm0 movdqa (%ecx),%xmm5 movdqu (%edx),%xmm2 -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 subl $16,%ebx jz L003odd_tail movdqu (%esi),%xmm3 movdqu 16(%esi),%xmm6 -.byte 102,15,56,0,221 -.byte 102,15,56,0,245 + pshufb %xmm5,%xmm3 + pshufb %xmm5,%xmm6 movdqu 32(%edx),%xmm5 pxor %xmm3,%xmm0 pshufd $78,%xmm6,%xmm3 movdqa %xmm6,%xmm7 pxor %xmm6,%xmm3 leal 32(%esi),%esi -.byte 102,15,58,68,242,0 -.byte 102,15,58,68,250,17 -.byte 102,15,58,68,221,0 + pclmulqdq $0,%xmm2,%xmm6 + pclmulqdq $17,%xmm2,%xmm7 + pclmulqdq $0,%xmm5,%xmm3 movups 16(%edx),%xmm2 nop subl $32,%ebx @@ -172,9 +172,9 @@ movdqa %xmm0,%xmm1 pxor %xmm0,%xmm4 nop -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,229,16 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $16,%xmm5,%xmm4 movups (%edx),%xmm2 xorps %xmm6,%xmm0 movdqa (%ecx),%xmm5 @@ -183,14 +183,14 @@ pxor %xmm0,%xmm3 movdqu 16(%esi),%xmm6 
pxor %xmm1,%xmm3 -.byte 102,15,56,0,253 + pshufb %xmm5,%xmm7 pxor %xmm3,%xmm4 movdqa %xmm4,%xmm3 psrldq $8,%xmm4 pslldq $8,%xmm3 pxor %xmm4,%xmm1 pxor %xmm3,%xmm0 -.byte 102,15,56,0,245 + pshufb %xmm5,%xmm6 pxor %xmm7,%xmm1 movdqa %xmm6,%xmm7 movdqa %xmm0,%xmm4 @@ -199,7 +199,7 @@ pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 -.byte 102,15,58,68,242,0 + pclmulqdq $0,%xmm2,%xmm6 movups 32(%edx),%xmm5 psllq $57,%xmm0 movdqa %xmm0,%xmm3 @@ -212,14 +212,14 @@ psrlq $1,%xmm0 pxor %xmm7,%xmm3 pxor %xmm4,%xmm1 -.byte 102,15,58,68,250,17 + pclmulqdq $17,%xmm2,%xmm7 movups 16(%edx),%xmm2 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 -.byte 102,15,58,68,221,0 + pclmulqdq $0,%xmm5,%xmm3 leal 32(%esi),%esi subl $32,%ebx ja L005mod_loop @@ -227,9 +227,9 @@ pshufd $78,%xmm0,%xmm4 movdqa %xmm0,%xmm1 pxor %xmm0,%xmm4 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,229,16 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $16,%xmm5,%xmm4 movdqa (%ecx),%xmm5 xorps %xmm6,%xmm0 xorps %xmm7,%xmm1 @@ -266,16 +266,16 @@ movups (%edx),%xmm2 L003odd_tail: movdqu (%esi),%xmm3 -.byte 102,15,56,0,221 + pshufb %xmm5,%xmm3 pxor %xmm3,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 pxor %xmm2,%xmm4 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $0,%xmm4,%xmm3 xorps %xmm0,%xmm3 xorps %xmm1,%xmm3 movdqa %xmm3,%xmm4 @@ -304,7 +304,7 @@ psrlq $1,%xmm0 pxor %xmm1,%xmm0 L006done: -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 movdqu %xmm0,(%eax) popl %edi popl %esi
diff --git a/gen/bcm/ghash-x86-linux.S b/gen/bcm/ghash-x86-linux.S index c897efc..960eeff 100644 --- a/gen/bcm/ghash-x86-linux.S +++ b/gen/bcm/ghash-x86-linux.S
@@ -35,9 +35,9 @@ pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 pxor %xmm2,%xmm4 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $0,%xmm4,%xmm3 xorps %xmm0,%xmm3 xorps %xmm1,%xmm3 movdqa %xmm3,%xmm4 @@ -71,7 +71,7 @@ movdqu %xmm2,(%edx) pxor %xmm0,%xmm4 movdqu %xmm0,16(%edx) -.byte 102,15,58,15,227,8 + palignr $8,%xmm3,%xmm4 movdqu %xmm4,32(%edx) ret .size gcm_init_clmul,.-.L_gcm_init_clmul_begin @@ -90,14 +90,14 @@ movdqu (%eax),%xmm0 movdqa (%ecx),%xmm5 movups (%edx),%xmm2 -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 movups 32(%edx),%xmm4 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $0,%xmm4,%xmm3 xorps %xmm0,%xmm3 xorps %xmm1,%xmm3 movdqa %xmm3,%xmm4 @@ -125,7 +125,7 @@ pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 movdqu %xmm0,(%eax) ret .size gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin @@ -150,22 +150,22 @@ movdqu (%eax),%xmm0 movdqa (%ecx),%xmm5 movdqu (%edx),%xmm2 -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 subl $16,%ebx jz .L003odd_tail movdqu (%esi),%xmm3 movdqu 16(%esi),%xmm6 -.byte 102,15,56,0,221 -.byte 102,15,56,0,245 + pshufb %xmm5,%xmm3 + pshufb %xmm5,%xmm6 movdqu 32(%edx),%xmm5 pxor %xmm3,%xmm0 pshufd $78,%xmm6,%xmm3 movdqa %xmm6,%xmm7 pxor %xmm6,%xmm3 leal 32(%esi),%esi -.byte 102,15,58,68,242,0 -.byte 102,15,58,68,250,17 -.byte 102,15,58,68,221,0 + pclmulqdq $0,%xmm2,%xmm6 + pclmulqdq $17,%xmm2,%xmm7 + pclmulqdq $0,%xmm5,%xmm3 movups 16(%edx),%xmm2 nop subl $32,%ebx @@ -177,9 +177,9 @@ movdqa %xmm0,%xmm1 pxor %xmm0,%xmm4 nop -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,229,16 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $16,%xmm5,%xmm4 movups (%edx),%xmm2 xorps %xmm6,%xmm0 movdqa (%ecx),%xmm5 @@ -188,14 
+188,14 @@ pxor %xmm0,%xmm3 movdqu 16(%esi),%xmm6 pxor %xmm1,%xmm3 -.byte 102,15,56,0,253 + pshufb %xmm5,%xmm7 pxor %xmm3,%xmm4 movdqa %xmm4,%xmm3 psrldq $8,%xmm4 pslldq $8,%xmm3 pxor %xmm4,%xmm1 pxor %xmm3,%xmm0 -.byte 102,15,56,0,245 + pshufb %xmm5,%xmm6 pxor %xmm7,%xmm1 movdqa %xmm6,%xmm7 movdqa %xmm0,%xmm4 @@ -204,7 +204,7 @@ pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 -.byte 102,15,58,68,242,0 + pclmulqdq $0,%xmm2,%xmm6 movups 32(%edx),%xmm5 psllq $57,%xmm0 movdqa %xmm0,%xmm3 @@ -217,14 +217,14 @@ psrlq $1,%xmm0 pxor %xmm7,%xmm3 pxor %xmm4,%xmm1 -.byte 102,15,58,68,250,17 + pclmulqdq $17,%xmm2,%xmm7 movups 16(%edx),%xmm2 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 -.byte 102,15,58,68,221,0 + pclmulqdq $0,%xmm5,%xmm3 leal 32(%esi),%esi subl $32,%ebx ja .L005mod_loop @@ -232,9 +232,9 @@ pshufd $78,%xmm0,%xmm4 movdqa %xmm0,%xmm1 pxor %xmm0,%xmm4 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,229,16 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $16,%xmm5,%xmm4 movdqa (%ecx),%xmm5 xorps %xmm6,%xmm0 xorps %xmm7,%xmm1 @@ -271,16 +271,16 @@ movups (%edx),%xmm2 .L003odd_tail: movdqu (%esi),%xmm3 -.byte 102,15,56,0,221 + pshufb %xmm5,%xmm3 pxor %xmm3,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 pxor %xmm2,%xmm4 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $0,%xmm4,%xmm3 xorps %xmm0,%xmm3 xorps %xmm1,%xmm3 movdqa %xmm3,%xmm4 @@ -309,7 +309,7 @@ psrlq $1,%xmm0 pxor %xmm1,%xmm0 .L006done: -.byte 102,15,56,0,197 + pshufb %xmm5,%xmm0 movdqu %xmm0,(%eax) popl %edi popl %esi
diff --git a/gen/bcm/ghash-x86-win.asm b/gen/bcm/ghash-x86-win.asm index d982fd6..64e8332 100644 --- a/gen/bcm/ghash-x86-win.asm +++ b/gen/bcm/ghash-x86-win.asm
@@ -41,9 +41,9 @@ pshufd xmm4,xmm2,78 pxor xmm3,xmm0 pxor xmm4,xmm2 -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,220,0 + pclmulqdq xmm0,xmm2,0 + pclmulqdq xmm1,xmm2,17 + pclmulqdq xmm3,xmm4,0 xorps xmm3,xmm0 xorps xmm3,xmm1 movdqa xmm4,xmm3 @@ -77,7 +77,7 @@ movdqu [edx],xmm2 pxor xmm4,xmm0 movdqu [16+edx],xmm0 -db 102,15,58,15,227,8 + palignr xmm4,xmm3,8 movdqu [32+edx],xmm4 ret global _gcm_gmult_clmul @@ -93,14 +93,14 @@ movdqu xmm0,[eax] movdqa xmm5,[ecx] movups xmm2,[edx] -db 102,15,56,0,197 + pshufb xmm0,xmm5 movups xmm4,[32+edx] movdqa xmm1,xmm0 pshufd xmm3,xmm0,78 pxor xmm3,xmm0 -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,220,0 + pclmulqdq xmm0,xmm2,0 + pclmulqdq xmm1,xmm2,17 + pclmulqdq xmm3,xmm4,0 xorps xmm3,xmm0 xorps xmm3,xmm1 movdqa xmm4,xmm3 @@ -128,7 +128,7 @@ pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 -db 102,15,56,0,197 + pshufb xmm0,xmm5 movdqu [eax],xmm0 ret global _gcm_ghash_clmul @@ -150,22 +150,22 @@ movdqu xmm0,[eax] movdqa xmm5,[ecx] movdqu xmm2,[edx] -db 102,15,56,0,197 + pshufb xmm0,xmm5 sub ebx,16 jz NEAR L$003odd_tail movdqu xmm3,[esi] movdqu xmm6,[16+esi] -db 102,15,56,0,221 -db 102,15,56,0,245 + pshufb xmm3,xmm5 + pshufb xmm6,xmm5 movdqu xmm5,[32+edx] pxor xmm0,xmm3 pshufd xmm3,xmm6,78 movdqa xmm7,xmm6 pxor xmm3,xmm6 lea esi,[32+esi] -db 102,15,58,68,242,0 -db 102,15,58,68,250,17 -db 102,15,58,68,221,0 + pclmulqdq xmm6,xmm2,0 + pclmulqdq xmm7,xmm2,17 + pclmulqdq xmm3,xmm5,0 movups xmm2,[16+edx] nop sub ebx,32 @@ -177,9 +177,9 @@ movdqa xmm1,xmm0 pxor xmm4,xmm0 nop -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,229,16 + pclmulqdq xmm0,xmm2,0 + pclmulqdq xmm1,xmm2,17 + pclmulqdq xmm4,xmm5,16 movups xmm2,[edx] xorps xmm0,xmm6 movdqa xmm5,[ecx] @@ -188,14 +188,14 @@ pxor xmm3,xmm0 movdqu xmm6,[16+esi] pxor xmm3,xmm1 -db 102,15,56,0,253 + pshufb xmm7,xmm5 pxor xmm4,xmm3 movdqa xmm3,xmm4 psrldq xmm4,8 pslldq xmm3,8 pxor xmm1,xmm4 pxor xmm0,xmm3 -db 102,15,56,0,245 + pshufb xmm6,xmm5 pxor 
xmm1,xmm7 movdqa xmm7,xmm6 movdqa xmm4,xmm0 @@ -204,7 +204,7 @@ pxor xmm3,xmm0 psllq xmm0,1 pxor xmm0,xmm3 -db 102,15,58,68,242,0 + pclmulqdq xmm6,xmm2,0 movups xmm5,[32+edx] psllq xmm0,57 movdqa xmm3,xmm0 @@ -217,14 +217,14 @@ psrlq xmm0,1 pxor xmm3,xmm7 pxor xmm1,xmm4 -db 102,15,58,68,250,17 + pclmulqdq xmm7,xmm2,17 movups xmm2,[16+edx] pxor xmm4,xmm0 psrlq xmm0,5 pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 -db 102,15,58,68,221,0 + pclmulqdq xmm3,xmm5,0 lea esi,[32+esi] sub ebx,32 ja NEAR L$005mod_loop @@ -232,9 +232,9 @@ pshufd xmm4,xmm0,78 movdqa xmm1,xmm0 pxor xmm4,xmm0 -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,229,16 + pclmulqdq xmm0,xmm2,0 + pclmulqdq xmm1,xmm2,17 + pclmulqdq xmm4,xmm5,16 movdqa xmm5,[ecx] xorps xmm0,xmm6 xorps xmm1,xmm7 @@ -271,16 +271,16 @@ movups xmm2,[edx] L$003odd_tail: movdqu xmm3,[esi] -db 102,15,56,0,221 + pshufb xmm3,xmm5 pxor xmm0,xmm3 movdqa xmm1,xmm0 pshufd xmm3,xmm0,78 pshufd xmm4,xmm2,78 pxor xmm3,xmm0 pxor xmm4,xmm2 -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,220,0 + pclmulqdq xmm0,xmm2,0 + pclmulqdq xmm1,xmm2,17 + pclmulqdq xmm3,xmm4,0 xorps xmm3,xmm0 xorps xmm3,xmm1 movdqa xmm4,xmm3 @@ -309,7 +309,7 @@ psrlq xmm0,1 pxor xmm0,xmm1 L$006done: -db 102,15,56,0,197 + pshufb xmm0,xmm5 movdqu [eax],xmm0 pop edi pop esi
diff --git a/gen/bcm/sha1-586-apple.S b/gen/bcm/sha1-586-apple.S index f0ab02b..f2c45ec 100644 --- a/gen/bcm/sha1-586-apple.S +++ b/gen/bcm/sha1-586-apple.S
@@ -1424,11 +1424,11 @@ movdqu -48(%ebp),%xmm1 movdqu -32(%ebp),%xmm2 movdqu -16(%ebp),%xmm3 -.byte 102,15,56,0,198 -.byte 102,15,56,0,206 -.byte 102,15,56,0,214 + pshufb %xmm6,%xmm0 + pshufb %xmm6,%xmm1 + pshufb %xmm6,%xmm2 movdqa %xmm7,96(%esp) -.byte 102,15,56,0,222 + pshufb %xmm6,%xmm3 paddd %xmm7,%xmm0 paddd %xmm7,%xmm1 paddd %xmm7,%xmm2 @@ -2355,7 +2355,7 @@ movdqu 32(%ebp),%xmm2 movdqu 48(%ebp),%xmm3 addl $64,%ebp -.byte 102,15,56,0,198 + pshufb %xmm6,%xmm0 movl %ebp,196(%esp) movdqa %xmm7,96(%esp) addl 16(%esp),%ebx @@ -2365,7 +2365,7 @@ addl %esi,%ebx xorl %edi,%ebp rorl $7,%edx -.byte 102,15,56,0,206 + pshufb %xmm6,%xmm1 addl %ecx,%ebx addl 20(%esp),%eax xorl %edx,%ebp @@ -2401,7 +2401,7 @@ addl %esi,%ecx xorl %eax,%ebp rorl $7,%edi -.byte 102,15,56,0,214 + pshufb %xmm6,%xmm2 addl %edx,%ecx addl 36(%esp),%ebx xorl %edi,%ebp @@ -2437,7 +2437,7 @@ addl %esi,%edx xorl %ebx,%ebp rorl $7,%eax -.byte 102,15,56,0,222 + pshufb %xmm6,%xmm3 addl %edi,%edx addl 52(%esp),%ecx xorl %eax,%ebp
diff --git a/gen/bcm/sha1-586-linux.S b/gen/bcm/sha1-586-linux.S index 0e5754f..3d8d213 100644 --- a/gen/bcm/sha1-586-linux.S +++ b/gen/bcm/sha1-586-linux.S
@@ -1427,11 +1427,11 @@ movdqu -48(%ebp),%xmm1 movdqu -32(%ebp),%xmm2 movdqu -16(%ebp),%xmm3 -.byte 102,15,56,0,198 -.byte 102,15,56,0,206 -.byte 102,15,56,0,214 + pshufb %xmm6,%xmm0 + pshufb %xmm6,%xmm1 + pshufb %xmm6,%xmm2 movdqa %xmm7,96(%esp) -.byte 102,15,56,0,222 + pshufb %xmm6,%xmm3 paddd %xmm7,%xmm0 paddd %xmm7,%xmm1 paddd %xmm7,%xmm2 @@ -2358,7 +2358,7 @@ movdqu 32(%ebp),%xmm2 movdqu 48(%ebp),%xmm3 addl $64,%ebp -.byte 102,15,56,0,198 + pshufb %xmm6,%xmm0 movl %ebp,196(%esp) movdqa %xmm7,96(%esp) addl 16(%esp),%ebx @@ -2368,7 +2368,7 @@ addl %esi,%ebx xorl %edi,%ebp rorl $7,%edx -.byte 102,15,56,0,206 + pshufb %xmm6,%xmm1 addl %ecx,%ebx addl 20(%esp),%eax xorl %edx,%ebp @@ -2404,7 +2404,7 @@ addl %esi,%ecx xorl %eax,%ebp rorl $7,%edi -.byte 102,15,56,0,214 + pshufb %xmm6,%xmm2 addl %edx,%ecx addl 36(%esp),%ebx xorl %edi,%ebp @@ -2440,7 +2440,7 @@ addl %esi,%edx xorl %ebx,%ebp rorl $7,%eax -.byte 102,15,56,0,222 + pshufb %xmm6,%xmm3 addl %edi,%edx addl 52(%esp),%ecx xorl %eax,%ebp
diff --git a/gen/bcm/sha1-586-win.asm b/gen/bcm/sha1-586-win.asm index a4e4875..7f0b44b 100644 --- a/gen/bcm/sha1-586-win.asm +++ b/gen/bcm/sha1-586-win.asm
@@ -1430,11 +1430,11 @@ movdqu xmm1,[ebp-48] movdqu xmm2,[ebp-32] movdqu xmm3,[ebp-16] -db 102,15,56,0,198 -db 102,15,56,0,206 -db 102,15,56,0,214 + pshufb xmm0,xmm6 + pshufb xmm1,xmm6 + pshufb xmm2,xmm6 movdqa [96+esp],xmm7 -db 102,15,56,0,222 + pshufb xmm3,xmm6 paddd xmm0,xmm7 paddd xmm1,xmm7 paddd xmm2,xmm7 @@ -2361,7 +2361,7 @@ movdqu xmm2,[32+ebp] movdqu xmm3,[48+ebp] add ebp,64 -db 102,15,56,0,198 + pshufb xmm0,xmm6 mov DWORD [196+esp],ebp movdqa [96+esp],xmm7 add ebx,DWORD [16+esp] @@ -2371,7 +2371,7 @@ add ebx,esi xor ebp,edi ror edx,7 -db 102,15,56,0,206 + pshufb xmm1,xmm6 add ebx,ecx add eax,DWORD [20+esp] xor ebp,edx @@ -2407,7 +2407,7 @@ add ecx,esi xor ebp,eax ror edi,7 -db 102,15,56,0,214 + pshufb xmm2,xmm6 add ecx,edx add ebx,DWORD [36+esp] xor ebp,edi @@ -2443,7 +2443,7 @@ add edx,esi xor ebp,ebx ror eax,7 -db 102,15,56,0,222 + pshufb xmm3,xmm6 add edx,edi add ecx,DWORD [52+esp] xor ebp,eax
diff --git a/gen/bcm/sha256-586-apple.S b/gen/bcm/sha256-586-apple.S index 8e74e68..66107d6 100644 --- a/gen/bcm/sha256-586-apple.S +++ b/gen/bcm/sha256-586-apple.S
@@ -3203,14 +3203,14 @@ movdqu 32(%edi),%xmm2 movdqu 48(%edi),%xmm3 addl $64,%edi -.byte 102,15,56,0,199 + pshufb %xmm7,%xmm0 movl %edi,100(%esp) -.byte 102,15,56,0,207 + pshufb %xmm7,%xmm1 movdqa (%ebp),%xmm4 -.byte 102,15,56,0,215 + pshufb %xmm7,%xmm2 movdqa 16(%ebp),%xmm5 paddd %xmm0,%xmm4 -.byte 102,15,56,0,223 + pshufb %xmm7,%xmm3 movdqa 32(%ebp),%xmm6 paddd %xmm1,%xmm5 movdqa 48(%ebp),%xmm7 @@ -3231,11 +3231,11 @@ movdqa %xmm3,%xmm7 xorl %ecx,%edx movl 24(%esp),%edi -.byte 102,15,58,15,224,4 + palignr $4,%xmm0,%xmm4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi -.byte 102,15,58,15,250,4 + palignr $4,%xmm2,%xmm7 movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi @@ -3391,11 +3391,11 @@ movdqa %xmm0,%xmm7 xorl %ecx,%edx movl 8(%esp),%edi -.byte 102,15,58,15,225,4 + palignr $4,%xmm1,%xmm4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi -.byte 102,15,58,15,251,4 + palignr $4,%xmm3,%xmm7 movl %ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi @@ -3551,11 +3551,11 @@ movdqa %xmm1,%xmm7 xorl %ecx,%edx movl 24(%esp),%edi -.byte 102,15,58,15,226,4 + palignr $4,%xmm2,%xmm4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi -.byte 102,15,58,15,248,4 + palignr $4,%xmm0,%xmm7 movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi @@ -3711,11 +3711,11 @@ movdqa %xmm2,%xmm7 xorl %ecx,%edx movl 8(%esp),%edi -.byte 102,15,58,15,227,4 + palignr $4,%xmm3,%xmm4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi -.byte 102,15,58,15,249,4 + palignr $4,%xmm1,%xmm7 movl %ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi
diff --git a/gen/bcm/sha256-586-linux.S b/gen/bcm/sha256-586-linux.S index 41b3759..d409651 100644 --- a/gen/bcm/sha256-586-linux.S +++ b/gen/bcm/sha256-586-linux.S
@@ -3206,14 +3206,14 @@ movdqu 32(%edi),%xmm2 movdqu 48(%edi),%xmm3 addl $64,%edi -.byte 102,15,56,0,199 + pshufb %xmm7,%xmm0 movl %edi,100(%esp) -.byte 102,15,56,0,207 + pshufb %xmm7,%xmm1 movdqa (%ebp),%xmm4 -.byte 102,15,56,0,215 + pshufb %xmm7,%xmm2 movdqa 16(%ebp),%xmm5 paddd %xmm0,%xmm4 -.byte 102,15,56,0,223 + pshufb %xmm7,%xmm3 movdqa 32(%ebp),%xmm6 paddd %xmm1,%xmm5 movdqa 48(%ebp),%xmm7 @@ -3234,11 +3234,11 @@ movdqa %xmm3,%xmm7 xorl %ecx,%edx movl 24(%esp),%edi -.byte 102,15,58,15,224,4 + palignr $4,%xmm0,%xmm4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi -.byte 102,15,58,15,250,4 + palignr $4,%xmm2,%xmm7 movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi @@ -3394,11 +3394,11 @@ movdqa %xmm0,%xmm7 xorl %ecx,%edx movl 8(%esp),%edi -.byte 102,15,58,15,225,4 + palignr $4,%xmm1,%xmm4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi -.byte 102,15,58,15,251,4 + palignr $4,%xmm3,%xmm7 movl %ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi @@ -3554,11 +3554,11 @@ movdqa %xmm1,%xmm7 xorl %ecx,%edx movl 24(%esp),%edi -.byte 102,15,58,15,226,4 + palignr $4,%xmm2,%xmm4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi -.byte 102,15,58,15,248,4 + palignr $4,%xmm0,%xmm7 movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi @@ -3714,11 +3714,11 @@ movdqa %xmm2,%xmm7 xorl %ecx,%edx movl 8(%esp),%edi -.byte 102,15,58,15,227,4 + palignr $4,%xmm3,%xmm4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi -.byte 102,15,58,15,249,4 + palignr $4,%xmm1,%xmm7 movl %ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi
diff --git a/gen/bcm/sha256-586-win.asm b/gen/bcm/sha256-586-win.asm index 8878695..b6fed17 100644 --- a/gen/bcm/sha256-586-win.asm +++ b/gen/bcm/sha256-586-win.asm
@@ -3209,14 +3209,14 @@ movdqu xmm2,[32+edi] movdqu xmm3,[48+edi] add edi,64 -db 102,15,56,0,199 + pshufb xmm0,xmm7 mov DWORD [100+esp],edi -db 102,15,56,0,207 + pshufb xmm1,xmm7 movdqa xmm4,[ebp] -db 102,15,56,0,215 + pshufb xmm2,xmm7 movdqa xmm5,[16+ebp] paddd xmm4,xmm0 -db 102,15,56,0,223 + pshufb xmm3,xmm7 movdqa xmm6,[32+ebp] paddd xmm5,xmm1 movdqa xmm7,[48+ebp] @@ -3237,11 +3237,11 @@ movdqa xmm7,xmm3 xor edx,ecx mov edi,DWORD [24+esp] -db 102,15,58,15,224,4 + palignr xmm4,xmm0,4 xor esi,edi ror edx,5 and esi,ecx -db 102,15,58,15,250,4 + palignr xmm7,xmm2,4 mov DWORD [16+esp],ecx xor edx,ecx xor edi,esi @@ -3397,11 +3397,11 @@ movdqa xmm7,xmm0 xor edx,ecx mov edi,DWORD [8+esp] -db 102,15,58,15,225,4 + palignr xmm4,xmm1,4 xor esi,edi ror edx,5 and esi,ecx -db 102,15,58,15,251,4 + palignr xmm7,xmm3,4 mov DWORD [esp],ecx xor edx,ecx xor edi,esi @@ -3557,11 +3557,11 @@ movdqa xmm7,xmm1 xor edx,ecx mov edi,DWORD [24+esp] -db 102,15,58,15,226,4 + palignr xmm4,xmm2,4 xor esi,edi ror edx,5 and esi,ecx -db 102,15,58,15,248,4 + palignr xmm7,xmm0,4 mov DWORD [16+esp],ecx xor edx,ecx xor edi,esi @@ -3717,11 +3717,11 @@ movdqa xmm7,xmm2 xor edx,ecx mov edi,DWORD [8+esp] -db 102,15,58,15,227,4 + palignr xmm4,xmm3,4 xor esi,edi ror edx,5 and esi,ecx -db 102,15,58,15,249,4 + palignr xmm7,xmm1,4 mov DWORD [esp],ecx xor edx,ecx xor edi,esi
diff --git a/gen/bcm/sha512-586-apple.S b/gen/bcm/sha512-586-apple.S index 785eaf5..be41827 100644 --- a/gen/bcm/sha512-586-apple.S +++ b/gen/bcm/sha512-586-apple.S
@@ -404,50 +404,50 @@ subl $256,%esp movdqa 640(%ebp),%xmm1 movdqu (%edi),%xmm0 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 movdqa (%ebp),%xmm3 movdqa %xmm1,%xmm2 movdqu 16(%edi),%xmm1 paddq %xmm0,%xmm3 -.byte 102,15,56,0,202 + pshufb %xmm2,%xmm1 movdqa %xmm3,-128(%edx) movdqa 16(%ebp),%xmm4 movdqa %xmm2,%xmm3 movdqu 32(%edi),%xmm2 paddq %xmm1,%xmm4 -.byte 102,15,56,0,211 + pshufb %xmm3,%xmm2 movdqa %xmm4,-112(%edx) movdqa 32(%ebp),%xmm5 movdqa %xmm3,%xmm4 movdqu 48(%edi),%xmm3 paddq %xmm2,%xmm5 -.byte 102,15,56,0,220 + pshufb %xmm4,%xmm3 movdqa %xmm5,-96(%edx) movdqa 48(%ebp),%xmm6 movdqa %xmm4,%xmm5 movdqu 64(%edi),%xmm4 paddq %xmm3,%xmm6 -.byte 102,15,56,0,229 + pshufb %xmm5,%xmm4 movdqa %xmm6,-80(%edx) movdqa 64(%ebp),%xmm7 movdqa %xmm5,%xmm6 movdqu 80(%edi),%xmm5 paddq %xmm4,%xmm7 -.byte 102,15,56,0,238 + pshufb %xmm6,%xmm5 movdqa %xmm7,-64(%edx) movdqa %xmm0,(%edx) movdqa 80(%ebp),%xmm0 movdqa %xmm6,%xmm7 movdqu 96(%edi),%xmm6 paddq %xmm5,%xmm0 -.byte 102,15,56,0,247 + pshufb %xmm7,%xmm6 movdqa %xmm0,-48(%edx) movdqa %xmm1,16(%edx) movdqa 96(%ebp),%xmm1 movdqa %xmm7,%xmm0 movdqu 112(%edi),%xmm7 paddq %xmm6,%xmm1 -.byte 102,15,56,0,248 + pshufb %xmm0,%xmm7 movdqa %xmm1,-32(%edx) movdqa %xmm2,32(%edx) movdqa 112(%ebp),%xmm2 @@ -478,9 +478,9 @@ L00600_47_ssse3: movdqa %xmm5,%xmm3 movdqa %xmm2,%xmm1 -.byte 102,15,58,15,208,8 + palignr $8,%xmm0,%xmm2 movdqa %xmm4,(%edx) -.byte 102,15,58,15,220,8 + palignr $8,%xmm4,%xmm3 movdqa %xmm2,%xmm4 psrlq $7,%xmm2 paddq %xmm3,%xmm0 @@ -605,9 +605,9 @@ movdqa %xmm2,-128(%edx) movdqa %xmm6,%xmm4 movdqa %xmm3,%xmm2 -.byte 102,15,58,15,217,8 + palignr $8,%xmm1,%xmm3 movdqa %xmm5,16(%edx) -.byte 102,15,58,15,229,8 + palignr $8,%xmm5,%xmm4 movdqa %xmm3,%xmm5 psrlq $7,%xmm3 paddq %xmm4,%xmm1 @@ -732,9 +732,9 @@ movdqa %xmm3,-112(%edx) movdqa %xmm7,%xmm5 movdqa %xmm4,%xmm3 -.byte 102,15,58,15,226,8 + palignr $8,%xmm2,%xmm4 movdqa %xmm6,32(%edx) -.byte 102,15,58,15,238,8 + palignr $8,%xmm6,%xmm5 movdqa %xmm4,%xmm6 psrlq $7,%xmm4 
paddq %xmm5,%xmm2 @@ -859,9 +859,9 @@ movdqa %xmm4,-96(%edx) movdqa %xmm0,%xmm6 movdqa %xmm5,%xmm4 -.byte 102,15,58,15,235,8 + palignr $8,%xmm3,%xmm5 movdqa %xmm7,48(%edx) -.byte 102,15,58,15,247,8 + palignr $8,%xmm7,%xmm6 movdqa %xmm5,%xmm7 psrlq $7,%xmm5 paddq %xmm6,%xmm3 @@ -986,9 +986,9 @@ movdqa %xmm5,-80(%edx) movdqa %xmm1,%xmm7 movdqa %xmm6,%xmm5 -.byte 102,15,58,15,244,8 + palignr $8,%xmm4,%xmm6 movdqa %xmm0,(%edx) -.byte 102,15,58,15,248,8 + palignr $8,%xmm0,%xmm7 movdqa %xmm6,%xmm0 psrlq $7,%xmm6 paddq %xmm7,%xmm4 @@ -1113,9 +1113,9 @@ movdqa %xmm6,-64(%edx) movdqa %xmm2,%xmm0 movdqa %xmm7,%xmm6 -.byte 102,15,58,15,253,8 + palignr $8,%xmm5,%xmm7 movdqa %xmm1,16(%edx) -.byte 102,15,58,15,193,8 + palignr $8,%xmm1,%xmm0 movdqa %xmm7,%xmm1 psrlq $7,%xmm7 paddq %xmm0,%xmm5 @@ -1240,9 +1240,9 @@ movdqa %xmm7,-48(%edx) movdqa %xmm3,%xmm1 movdqa %xmm0,%xmm7 -.byte 102,15,58,15,198,8 + palignr $8,%xmm6,%xmm0 movdqa %xmm2,32(%edx) -.byte 102,15,58,15,202,8 + palignr $8,%xmm2,%xmm1 movdqa %xmm0,%xmm2 psrlq $7,%xmm0 paddq %xmm1,%xmm6 @@ -1367,9 +1367,9 @@ movdqa %xmm0,-32(%edx) movdqa %xmm4,%xmm2 movdqa %xmm1,%xmm0 -.byte 102,15,58,15,207,8 + palignr $8,%xmm7,%xmm1 movdqa %xmm3,48(%edx) -.byte 102,15,58,15,211,8 + palignr $8,%xmm3,%xmm2 movdqa %xmm1,%xmm3 psrlq $7,%xmm1 paddq %xmm2,%xmm7 @@ -1498,12 +1498,12 @@ movdqa (%ebp),%xmm1 leal -640(%ebp),%ebp movdqu (%ebx),%xmm0 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 movdqa (%ebp),%xmm3 movdqa %xmm1,%xmm2 movdqu 16(%ebx),%xmm1 paddq %xmm0,%xmm3 -.byte 102,15,56,0,202 + pshufb %xmm2,%xmm1 movq %mm4,%mm1 movq -128(%edx),%mm7 pxor %mm6,%mm5 @@ -1601,7 +1601,7 @@ movdqa %xmm2,%xmm3 movdqu 32(%ebx),%xmm2 paddq %xmm1,%xmm4 -.byte 102,15,56,0,211 + pshufb %xmm3,%xmm2 movq %mm4,%mm1 movq -112(%edx),%mm7 pxor %mm6,%mm5 @@ -1699,7 +1699,7 @@ movdqa %xmm3,%xmm4 movdqu 48(%ebx),%xmm3 paddq %xmm2,%xmm5 -.byte 102,15,56,0,220 + pshufb %xmm4,%xmm3 movq %mm4,%mm1 movq -96(%edx),%mm7 pxor %mm6,%mm5 @@ -1797,7 +1797,7 @@ movdqa 
%xmm4,%xmm5 movdqu 64(%ebx),%xmm4 paddq %xmm3,%xmm6 -.byte 102,15,56,0,229 + pshufb %xmm5,%xmm4 movq %mm4,%mm1 movq -80(%edx),%mm7 pxor %mm6,%mm5 @@ -1895,7 +1895,7 @@ movdqa %xmm5,%xmm6 movdqu 80(%ebx),%xmm5 paddq %xmm4,%xmm7 -.byte 102,15,56,0,238 + pshufb %xmm6,%xmm5 movq %mm4,%mm1 movq -64(%edx),%mm7 pxor %mm6,%mm5 @@ -1994,7 +1994,7 @@ movdqa %xmm6,%xmm7 movdqu 96(%ebx),%xmm6 paddq %xmm5,%xmm0 -.byte 102,15,56,0,247 + pshufb %xmm7,%xmm6 movq %mm4,%mm1 movq -48(%edx),%mm7 pxor %mm6,%mm5 @@ -2093,7 +2093,7 @@ movdqa %xmm7,%xmm0 movdqu 112(%ebx),%xmm7 paddq %xmm6,%xmm1 -.byte 102,15,56,0,248 + pshufb %xmm0,%xmm7 movq %mm4,%mm1 movq -32(%edx),%mm7 pxor %mm6,%mm5
diff --git a/gen/bcm/sha512-586-linux.S b/gen/bcm/sha512-586-linux.S index e82bd00..ebeb87d 100644 --- a/gen/bcm/sha512-586-linux.S +++ b/gen/bcm/sha512-586-linux.S
@@ -407,50 +407,50 @@ subl $256,%esp movdqa 640(%ebp),%xmm1 movdqu (%edi),%xmm0 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 movdqa (%ebp),%xmm3 movdqa %xmm1,%xmm2 movdqu 16(%edi),%xmm1 paddq %xmm0,%xmm3 -.byte 102,15,56,0,202 + pshufb %xmm2,%xmm1 movdqa %xmm3,-128(%edx) movdqa 16(%ebp),%xmm4 movdqa %xmm2,%xmm3 movdqu 32(%edi),%xmm2 paddq %xmm1,%xmm4 -.byte 102,15,56,0,211 + pshufb %xmm3,%xmm2 movdqa %xmm4,-112(%edx) movdqa 32(%ebp),%xmm5 movdqa %xmm3,%xmm4 movdqu 48(%edi),%xmm3 paddq %xmm2,%xmm5 -.byte 102,15,56,0,220 + pshufb %xmm4,%xmm3 movdqa %xmm5,-96(%edx) movdqa 48(%ebp),%xmm6 movdqa %xmm4,%xmm5 movdqu 64(%edi),%xmm4 paddq %xmm3,%xmm6 -.byte 102,15,56,0,229 + pshufb %xmm5,%xmm4 movdqa %xmm6,-80(%edx) movdqa 64(%ebp),%xmm7 movdqa %xmm5,%xmm6 movdqu 80(%edi),%xmm5 paddq %xmm4,%xmm7 -.byte 102,15,56,0,238 + pshufb %xmm6,%xmm5 movdqa %xmm7,-64(%edx) movdqa %xmm0,(%edx) movdqa 80(%ebp),%xmm0 movdqa %xmm6,%xmm7 movdqu 96(%edi),%xmm6 paddq %xmm5,%xmm0 -.byte 102,15,56,0,247 + pshufb %xmm7,%xmm6 movdqa %xmm0,-48(%edx) movdqa %xmm1,16(%edx) movdqa 96(%ebp),%xmm1 movdqa %xmm7,%xmm0 movdqu 112(%edi),%xmm7 paddq %xmm6,%xmm1 -.byte 102,15,56,0,248 + pshufb %xmm0,%xmm7 movdqa %xmm1,-32(%edx) movdqa %xmm2,32(%edx) movdqa 112(%ebp),%xmm2 @@ -481,9 +481,9 @@ .L00600_47_ssse3: movdqa %xmm5,%xmm3 movdqa %xmm2,%xmm1 -.byte 102,15,58,15,208,8 + palignr $8,%xmm0,%xmm2 movdqa %xmm4,(%edx) -.byte 102,15,58,15,220,8 + palignr $8,%xmm4,%xmm3 movdqa %xmm2,%xmm4 psrlq $7,%xmm2 paddq %xmm3,%xmm0 @@ -608,9 +608,9 @@ movdqa %xmm2,-128(%edx) movdqa %xmm6,%xmm4 movdqa %xmm3,%xmm2 -.byte 102,15,58,15,217,8 + palignr $8,%xmm1,%xmm3 movdqa %xmm5,16(%edx) -.byte 102,15,58,15,229,8 + palignr $8,%xmm5,%xmm4 movdqa %xmm3,%xmm5 psrlq $7,%xmm3 paddq %xmm4,%xmm1 @@ -735,9 +735,9 @@ movdqa %xmm3,-112(%edx) movdqa %xmm7,%xmm5 movdqa %xmm4,%xmm3 -.byte 102,15,58,15,226,8 + palignr $8,%xmm2,%xmm4 movdqa %xmm6,32(%edx) -.byte 102,15,58,15,238,8 + palignr $8,%xmm6,%xmm5 movdqa %xmm4,%xmm6 psrlq $7,%xmm4 
paddq %xmm5,%xmm2 @@ -862,9 +862,9 @@ movdqa %xmm4,-96(%edx) movdqa %xmm0,%xmm6 movdqa %xmm5,%xmm4 -.byte 102,15,58,15,235,8 + palignr $8,%xmm3,%xmm5 movdqa %xmm7,48(%edx) -.byte 102,15,58,15,247,8 + palignr $8,%xmm7,%xmm6 movdqa %xmm5,%xmm7 psrlq $7,%xmm5 paddq %xmm6,%xmm3 @@ -989,9 +989,9 @@ movdqa %xmm5,-80(%edx) movdqa %xmm1,%xmm7 movdqa %xmm6,%xmm5 -.byte 102,15,58,15,244,8 + palignr $8,%xmm4,%xmm6 movdqa %xmm0,(%edx) -.byte 102,15,58,15,248,8 + palignr $8,%xmm0,%xmm7 movdqa %xmm6,%xmm0 psrlq $7,%xmm6 paddq %xmm7,%xmm4 @@ -1116,9 +1116,9 @@ movdqa %xmm6,-64(%edx) movdqa %xmm2,%xmm0 movdqa %xmm7,%xmm6 -.byte 102,15,58,15,253,8 + palignr $8,%xmm5,%xmm7 movdqa %xmm1,16(%edx) -.byte 102,15,58,15,193,8 + palignr $8,%xmm1,%xmm0 movdqa %xmm7,%xmm1 psrlq $7,%xmm7 paddq %xmm0,%xmm5 @@ -1243,9 +1243,9 @@ movdqa %xmm7,-48(%edx) movdqa %xmm3,%xmm1 movdqa %xmm0,%xmm7 -.byte 102,15,58,15,198,8 + palignr $8,%xmm6,%xmm0 movdqa %xmm2,32(%edx) -.byte 102,15,58,15,202,8 + palignr $8,%xmm2,%xmm1 movdqa %xmm0,%xmm2 psrlq $7,%xmm0 paddq %xmm1,%xmm6 @@ -1370,9 +1370,9 @@ movdqa %xmm0,-32(%edx) movdqa %xmm4,%xmm2 movdqa %xmm1,%xmm0 -.byte 102,15,58,15,207,8 + palignr $8,%xmm7,%xmm1 movdqa %xmm3,48(%edx) -.byte 102,15,58,15,211,8 + palignr $8,%xmm3,%xmm2 movdqa %xmm1,%xmm3 psrlq $7,%xmm1 paddq %xmm2,%xmm7 @@ -1501,12 +1501,12 @@ movdqa (%ebp),%xmm1 leal -640(%ebp),%ebp movdqu (%ebx),%xmm0 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 movdqa (%ebp),%xmm3 movdqa %xmm1,%xmm2 movdqu 16(%ebx),%xmm1 paddq %xmm0,%xmm3 -.byte 102,15,56,0,202 + pshufb %xmm2,%xmm1 movq %mm4,%mm1 movq -128(%edx),%mm7 pxor %mm6,%mm5 @@ -1604,7 +1604,7 @@ movdqa %xmm2,%xmm3 movdqu 32(%ebx),%xmm2 paddq %xmm1,%xmm4 -.byte 102,15,56,0,211 + pshufb %xmm3,%xmm2 movq %mm4,%mm1 movq -112(%edx),%mm7 pxor %mm6,%mm5 @@ -1702,7 +1702,7 @@ movdqa %xmm3,%xmm4 movdqu 48(%ebx),%xmm3 paddq %xmm2,%xmm5 -.byte 102,15,56,0,220 + pshufb %xmm4,%xmm3 movq %mm4,%mm1 movq -96(%edx),%mm7 pxor %mm6,%mm5 @@ -1800,7 +1800,7 @@ movdqa 
%xmm4,%xmm5 movdqu 64(%ebx),%xmm4 paddq %xmm3,%xmm6 -.byte 102,15,56,0,229 + pshufb %xmm5,%xmm4 movq %mm4,%mm1 movq -80(%edx),%mm7 pxor %mm6,%mm5 @@ -1898,7 +1898,7 @@ movdqa %xmm5,%xmm6 movdqu 80(%ebx),%xmm5 paddq %xmm4,%xmm7 -.byte 102,15,56,0,238 + pshufb %xmm6,%xmm5 movq %mm4,%mm1 movq -64(%edx),%mm7 pxor %mm6,%mm5 @@ -1997,7 +1997,7 @@ movdqa %xmm6,%xmm7 movdqu 96(%ebx),%xmm6 paddq %xmm5,%xmm0 -.byte 102,15,56,0,247 + pshufb %xmm7,%xmm6 movq %mm4,%mm1 movq -48(%edx),%mm7 pxor %mm6,%mm5 @@ -2096,7 +2096,7 @@ movdqa %xmm7,%xmm0 movdqu 112(%ebx),%xmm7 paddq %xmm6,%xmm1 -.byte 102,15,56,0,248 + pshufb %xmm0,%xmm7 movq %mm4,%mm1 movq -32(%edx),%mm7 pxor %mm6,%mm5
diff --git a/gen/bcm/sha512-586-win.asm b/gen/bcm/sha512-586-win.asm index 75129dc..2089cf8 100644 --- a/gen/bcm/sha512-586-win.asm +++ b/gen/bcm/sha512-586-win.asm
@@ -410,50 +410,50 @@ sub esp,256 movdqa xmm1,[640+ebp] movdqu xmm0,[edi] -db 102,15,56,0,193 + pshufb xmm0,xmm1 movdqa xmm3,[ebp] movdqa xmm2,xmm1 movdqu xmm1,[16+edi] paddq xmm3,xmm0 -db 102,15,56,0,202 + pshufb xmm1,xmm2 movdqa [edx-128],xmm3 movdqa xmm4,[16+ebp] movdqa xmm3,xmm2 movdqu xmm2,[32+edi] paddq xmm4,xmm1 -db 102,15,56,0,211 + pshufb xmm2,xmm3 movdqa [edx-112],xmm4 movdqa xmm5,[32+ebp] movdqa xmm4,xmm3 movdqu xmm3,[48+edi] paddq xmm5,xmm2 -db 102,15,56,0,220 + pshufb xmm3,xmm4 movdqa [edx-96],xmm5 movdqa xmm6,[48+ebp] movdqa xmm5,xmm4 movdqu xmm4,[64+edi] paddq xmm6,xmm3 -db 102,15,56,0,229 + pshufb xmm4,xmm5 movdqa [edx-80],xmm6 movdqa xmm7,[64+ebp] movdqa xmm6,xmm5 movdqu xmm5,[80+edi] paddq xmm7,xmm4 -db 102,15,56,0,238 + pshufb xmm5,xmm6 movdqa [edx-64],xmm7 movdqa [edx],xmm0 movdqa xmm0,[80+ebp] movdqa xmm7,xmm6 movdqu xmm6,[96+edi] paddq xmm0,xmm5 -db 102,15,56,0,247 + pshufb xmm6,xmm7 movdqa [edx-48],xmm0 movdqa [16+edx],xmm1 movdqa xmm1,[96+ebp] movdqa xmm0,xmm7 movdqu xmm7,[112+edi] paddq xmm1,xmm6 -db 102,15,56,0,248 + pshufb xmm7,xmm0 movdqa [edx-32],xmm1 movdqa [32+edx],xmm2 movdqa xmm2,[112+ebp] @@ -484,9 +484,9 @@ L$00600_47_ssse3: movdqa xmm3,xmm5 movdqa xmm1,xmm2 -db 102,15,58,15,208,8 + palignr xmm2,xmm0,8 movdqa [edx],xmm4 -db 102,15,58,15,220,8 + palignr xmm3,xmm4,8 movdqa xmm4,xmm2 psrlq xmm2,7 paddq xmm0,xmm3 @@ -611,9 +611,9 @@ movdqa [edx-128],xmm2 movdqa xmm4,xmm6 movdqa xmm2,xmm3 -db 102,15,58,15,217,8 + palignr xmm3,xmm1,8 movdqa [16+edx],xmm5 -db 102,15,58,15,229,8 + palignr xmm4,xmm5,8 movdqa xmm5,xmm3 psrlq xmm3,7 paddq xmm1,xmm4 @@ -738,9 +738,9 @@ movdqa [edx-112],xmm3 movdqa xmm5,xmm7 movdqa xmm3,xmm4 -db 102,15,58,15,226,8 + palignr xmm4,xmm2,8 movdqa [32+edx],xmm6 -db 102,15,58,15,238,8 + palignr xmm5,xmm6,8 movdqa xmm6,xmm4 psrlq xmm4,7 paddq xmm2,xmm5 @@ -865,9 +865,9 @@ movdqa [edx-96],xmm4 movdqa xmm6,xmm0 movdqa xmm4,xmm5 -db 102,15,58,15,235,8 + palignr xmm5,xmm3,8 movdqa [48+edx],xmm7 -db 102,15,58,15,247,8 + 
palignr xmm6,xmm7,8 movdqa xmm7,xmm5 psrlq xmm5,7 paddq xmm3,xmm6 @@ -992,9 +992,9 @@ movdqa [edx-80],xmm5 movdqa xmm7,xmm1 movdqa xmm5,xmm6 -db 102,15,58,15,244,8 + palignr xmm6,xmm4,8 movdqa [edx],xmm0 -db 102,15,58,15,248,8 + palignr xmm7,xmm0,8 movdqa xmm0,xmm6 psrlq xmm6,7 paddq xmm4,xmm7 @@ -1119,9 +1119,9 @@ movdqa [edx-64],xmm6 movdqa xmm0,xmm2 movdqa xmm6,xmm7 -db 102,15,58,15,253,8 + palignr xmm7,xmm5,8 movdqa [16+edx],xmm1 -db 102,15,58,15,193,8 + palignr xmm0,xmm1,8 movdqa xmm1,xmm7 psrlq xmm7,7 paddq xmm5,xmm0 @@ -1246,9 +1246,9 @@ movdqa [edx-48],xmm7 movdqa xmm1,xmm3 movdqa xmm7,xmm0 -db 102,15,58,15,198,8 + palignr xmm0,xmm6,8 movdqa [32+edx],xmm2 -db 102,15,58,15,202,8 + palignr xmm1,xmm2,8 movdqa xmm2,xmm0 psrlq xmm0,7 paddq xmm6,xmm1 @@ -1373,9 +1373,9 @@ movdqa [edx-32],xmm0 movdqa xmm2,xmm4 movdqa xmm0,xmm1 -db 102,15,58,15,207,8 + palignr xmm1,xmm7,8 movdqa [48+edx],xmm3 -db 102,15,58,15,211,8 + palignr xmm2,xmm3,8 movdqa xmm3,xmm1 psrlq xmm1,7 paddq xmm7,xmm2 @@ -1504,12 +1504,12 @@ movdqa xmm1,[ebp] lea ebp,[ebp-640] movdqu xmm0,[ebx] -db 102,15,56,0,193 + pshufb xmm0,xmm1 movdqa xmm3,[ebp] movdqa xmm2,xmm1 movdqu xmm1,[16+ebx] paddq xmm3,xmm0 -db 102,15,56,0,202 + pshufb xmm1,xmm2 movq mm1,mm4 movq mm7,[edx-128] pxor mm5,mm6 @@ -1607,7 +1607,7 @@ movdqa xmm3,xmm2 movdqu xmm2,[32+ebx] paddq xmm4,xmm1 -db 102,15,56,0,211 + pshufb xmm2,xmm3 movq mm1,mm4 movq mm7,[edx-112] pxor mm5,mm6 @@ -1705,7 +1705,7 @@ movdqa xmm4,xmm3 movdqu xmm3,[48+ebx] paddq xmm5,xmm2 -db 102,15,56,0,220 + pshufb xmm3,xmm4 movq mm1,mm4 movq mm7,[edx-96] pxor mm5,mm6 @@ -1803,7 +1803,7 @@ movdqa xmm5,xmm4 movdqu xmm4,[64+ebx] paddq xmm6,xmm3 -db 102,15,56,0,229 + pshufb xmm4,xmm5 movq mm1,mm4 movq mm7,[edx-80] pxor mm5,mm6 @@ -1901,7 +1901,7 @@ movdqa xmm6,xmm5 movdqu xmm5,[80+ebx] paddq xmm7,xmm4 -db 102,15,56,0,238 + pshufb xmm5,xmm6 movq mm1,mm4 movq mm7,[edx-64] pxor mm5,mm6 @@ -2000,7 +2000,7 @@ movdqa xmm7,xmm6 movdqu xmm6,[96+ebx] paddq xmm0,xmm5 -db 
102,15,56,0,247 + pshufb xmm6,xmm7 movq mm1,mm4 movq mm7,[edx-48] pxor mm5,mm6 @@ -2099,7 +2099,7 @@ movdqa xmm0,xmm7 movdqu xmm7,[112+ebx] paddq xmm1,xmm6 -db 102,15,56,0,248 + pshufb xmm7,xmm0 movq mm1,mm4 movq mm7,[edx-32] pxor mm5,mm6
diff --git a/gen/bcm/vpaes-x86-apple.S b/gen/bcm/vpaes-x86-apple.S index 02d3787..b6717d5 100644 --- a/gen/bcm/vpaes-x86-apple.S +++ b/gen/bcm/vpaes-x86-apple.S
@@ -81,12 +81,12 @@ pandn %xmm0,%xmm1 pand %xmm6,%xmm0 movdqu (%edx),%xmm5 -.byte 102,15,56,0,208 + pshufb %xmm0,%xmm2 movdqa 16(%ebp),%xmm0 pxor %xmm5,%xmm2 psrld $4,%xmm1 addl $16,%edx -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 leal 192(%ebp),%ebx pxor %xmm2,%xmm0 jmp L000enc_entry @@ -94,25 +94,25 @@ L001enc_loop: movdqa 32(%ebp),%xmm4 movdqa 48(%ebp),%xmm0 -.byte 102,15,56,0,226 -.byte 102,15,56,0,195 + pshufb %xmm2,%xmm4 + pshufb %xmm3,%xmm0 pxor %xmm5,%xmm4 movdqa 64(%ebp),%xmm5 pxor %xmm4,%xmm0 movdqa -64(%ebx,%ecx,1),%xmm1 -.byte 102,15,56,0,234 + pshufb %xmm2,%xmm5 movdqa 80(%ebp),%xmm2 movdqa (%ebx,%ecx,1),%xmm4 -.byte 102,15,56,0,211 + pshufb %xmm3,%xmm2 movdqa %xmm0,%xmm3 pxor %xmm5,%xmm2 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 addl $16,%edx pxor %xmm2,%xmm0 -.byte 102,15,56,0,220 + pshufb %xmm4,%xmm3 addl $16,%ecx pxor %xmm0,%xmm3 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 andl $48,%ecx subl $1,%eax pxor %xmm3,%xmm0 @@ -122,30 +122,30 @@ pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm6,%xmm0 -.byte 102,15,56,0,232 + pshufb %xmm0,%xmm5 movdqa %xmm7,%xmm3 pxor %xmm1,%xmm0 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 movdqa %xmm7,%xmm4 pxor %xmm5,%xmm3 -.byte 102,15,56,0,224 + pshufb %xmm0,%xmm4 movdqa %xmm7,%xmm2 pxor %xmm5,%xmm4 -.byte 102,15,56,0,211 + pshufb %xmm3,%xmm2 movdqa %xmm7,%xmm3 pxor %xmm0,%xmm2 -.byte 102,15,56,0,220 + pshufb %xmm4,%xmm3 movdqu (%edx),%xmm5 pxor %xmm1,%xmm3 jnz L001enc_loop movdqa 96(%ebp),%xmm4 movdqa 112(%ebp),%xmm0 -.byte 102,15,56,0,226 + pshufb %xmm2,%xmm4 pxor %xmm5,%xmm4 -.byte 102,15,56,0,195 + pshufb %xmm3,%xmm0 movdqa 64(%ebx,%ecx,1),%xmm1 pxor %xmm4,%xmm0 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 ret .private_extern __vpaes_decrypt_core .align 4 @@ -160,10 +160,10 @@ movdqu (%edx),%xmm5 shll $4,%ecx pand %xmm6,%xmm0 -.byte 102,15,56,0,208 + pshufb %xmm0,%xmm2 movdqa -48(%ebx),%xmm0 xorl $48,%ecx -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 andl $48,%ecx pxor %xmm5,%xmm2 movdqa 176(%ebp),%xmm5 @@ -175,32 +175,32 
@@ L003dec_loop: movdqa -32(%ebx),%xmm4 movdqa -16(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,203 + pshufb %xmm2,%xmm4 + pshufb %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa (%ebx),%xmm4 pxor %xmm1,%xmm0 movdqa 16(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa 32(%ebx),%xmm4 pxor %xmm1,%xmm0 movdqa 48(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa 64(%ebx),%xmm4 pxor %xmm1,%xmm0 movdqa 80(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 pxor %xmm4,%xmm0 addl $16,%edx -.byte 102,15,58,15,237,12 + palignr $12,%xmm5,%xmm5 pxor %xmm1,%xmm0 subl $1,%eax L002dec_entry: @@ -209,30 +209,30 @@ pandn %xmm0,%xmm1 pand %xmm6,%xmm0 psrld $4,%xmm1 -.byte 102,15,56,0,208 + pshufb %xmm0,%xmm2 movdqa %xmm7,%xmm3 pxor %xmm1,%xmm0 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 movdqa %xmm7,%xmm4 pxor %xmm2,%xmm3 -.byte 102,15,56,0,224 + pshufb %xmm0,%xmm4 pxor %xmm2,%xmm4 movdqa %xmm7,%xmm2 -.byte 102,15,56,0,211 + pshufb %xmm3,%xmm2 movdqa %xmm7,%xmm3 pxor %xmm0,%xmm2 -.byte 102,15,56,0,220 + pshufb %xmm4,%xmm3 movdqu (%edx),%xmm0 pxor %xmm1,%xmm3 jnz L003dec_loop movdqa 96(%ebx),%xmm4 -.byte 102,15,56,0,226 + pshufb %xmm2,%xmm4 pxor %xmm0,%xmm4 movdqa 112(%ebx),%xmm0 movdqa (%ecx),%xmm2 -.byte 102,15,56,0,195 + pshufb %xmm3,%xmm0 pxor %xmm4,%xmm0 -.byte 102,15,56,0,194 + pshufb %xmm2,%xmm0 ret .private_extern __vpaes_schedule_core .align 4 @@ -251,7 +251,7 @@ jmp L005schedule_go L004schedule_am_decrypting: movdqa 256(%ebp,%ecx,1),%xmm1 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 movdqu %xmm3,(%edx) xorl $48,%ecx L005schedule_go: @@ -276,7 +276,7 @@ movl $4,%eax L011loop_schedule_192: call __vpaes_schedule_round -.byte 102,15,58,15,198,8 + palignr 
$8,%xmm6,%xmm0 call __vpaes_schedule_mangle call __vpaes_schedule_192_smear call __vpaes_schedule_mangle @@ -310,7 +310,7 @@ testl %edi,%edi jnz L013schedule_mangle_last_dec movdqa 256(%ebp,%ecx,1),%xmm1 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 leal 352(%ebp),%ebx addl $32,%edx L013schedule_mangle_last_dec: @@ -343,11 +343,11 @@ __vpaes_schedule_round: movdqa 8(%esp),%xmm2 pxor %xmm1,%xmm1 -.byte 102,15,58,15,202,15 -.byte 102,15,58,15,210,15 + palignr $15,%xmm2,%xmm1 + palignr $15,%xmm2,%xmm2 pxor %xmm1,%xmm7 pshufd $255,%xmm0,%xmm0 -.byte 102,15,58,15,192,1 + palignr $1,%xmm0,%xmm0 movdqa %xmm2,8(%esp) L_vpaes_schedule_low_round: movdqa %xmm7,%xmm1 @@ -364,24 +364,24 @@ psrld $4,%xmm1 pand %xmm4,%xmm0 movdqa -32(%ebp),%xmm2 -.byte 102,15,56,0,208 + pshufb %xmm0,%xmm2 pxor %xmm1,%xmm0 movdqa %xmm5,%xmm3 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 pxor %xmm2,%xmm3 movdqa %xmm5,%xmm4 -.byte 102,15,56,0,224 + pshufb %xmm0,%xmm4 pxor %xmm2,%xmm4 movdqa %xmm5,%xmm2 -.byte 102,15,56,0,211 + pshufb %xmm3,%xmm2 pxor %xmm0,%xmm2 movdqa %xmm5,%xmm3 -.byte 102,15,56,0,220 + pshufb %xmm4,%xmm3 pxor %xmm1,%xmm3 movdqa 32(%ebp),%xmm4 -.byte 102,15,56,0,226 + pshufb %xmm2,%xmm4 movdqa 48(%ebp),%xmm0 -.byte 102,15,56,0,195 + pshufb %xmm3,%xmm0 pxor %xmm4,%xmm0 pxor %xmm7,%xmm0 movdqa %xmm0,%xmm7 @@ -395,9 +395,9 @@ psrld $4,%xmm1 pand %xmm2,%xmm0 movdqa (%ebx),%xmm2 -.byte 102,15,56,0,208 + pshufb %xmm0,%xmm2 movdqa 16(%ebx),%xmm0 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 pxor %xmm2,%xmm0 ret .private_extern __vpaes_schedule_mangle @@ -409,11 +409,11 @@ jnz L014schedule_mangle_dec addl $16,%edx pxor 336(%ebp),%xmm4 -.byte 102,15,56,0,229 + pshufb %xmm5,%xmm4 movdqa %xmm4,%xmm3 -.byte 102,15,56,0,229 + pshufb %xmm5,%xmm4 pxor %xmm4,%xmm3 -.byte 102,15,56,0,229 + pshufb %xmm5,%xmm4 pxor %xmm4,%xmm3 jmp L015schedule_mangle_both .align 4,0x90 @@ -425,35 +425,35 @@ psrld $4,%xmm1 pand %xmm2,%xmm4 movdqa (%esi),%xmm2 -.byte 102,15,56,0,212 + pshufb %xmm4,%xmm2 movdqa 
16(%esi),%xmm3 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 + pshufb %xmm5,%xmm3 movdqa 32(%esi),%xmm2 -.byte 102,15,56,0,212 + pshufb %xmm4,%xmm2 pxor %xmm3,%xmm2 movdqa 48(%esi),%xmm3 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 + pshufb %xmm5,%xmm3 movdqa 64(%esi),%xmm2 -.byte 102,15,56,0,212 + pshufb %xmm4,%xmm2 pxor %xmm3,%xmm2 movdqa 80(%esi),%xmm3 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 + pshufb %xmm5,%xmm3 movdqa 96(%esi),%xmm2 -.byte 102,15,56,0,212 + pshufb %xmm4,%xmm2 pxor %xmm3,%xmm2 movdqa 112(%esi),%xmm3 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 pxor %xmm2,%xmm3 addl $-16,%edx L015schedule_mangle_both: movdqa 256(%ebp,%ecx,1),%xmm1 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 addl $-16,%ecx andl $48,%ecx movdqu %xmm3,(%edx)
diff --git a/gen/bcm/vpaes-x86-linux.S b/gen/bcm/vpaes-x86-linux.S index 31dc9a0..13da4aa 100644 --- a/gen/bcm/vpaes-x86-linux.S +++ b/gen/bcm/vpaes-x86-linux.S
@@ -84,12 +84,12 @@ pandn %xmm0,%xmm1 pand %xmm6,%xmm0 movdqu (%edx),%xmm5 -.byte 102,15,56,0,208 + pshufb %xmm0,%xmm2 movdqa 16(%ebp),%xmm0 pxor %xmm5,%xmm2 psrld $4,%xmm1 addl $16,%edx -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 leal 192(%ebp),%ebx pxor %xmm2,%xmm0 jmp .L000enc_entry @@ -97,25 +97,25 @@ .L001enc_loop: movdqa 32(%ebp),%xmm4 movdqa 48(%ebp),%xmm0 -.byte 102,15,56,0,226 -.byte 102,15,56,0,195 + pshufb %xmm2,%xmm4 + pshufb %xmm3,%xmm0 pxor %xmm5,%xmm4 movdqa 64(%ebp),%xmm5 pxor %xmm4,%xmm0 movdqa -64(%ebx,%ecx,1),%xmm1 -.byte 102,15,56,0,234 + pshufb %xmm2,%xmm5 movdqa 80(%ebp),%xmm2 movdqa (%ebx,%ecx,1),%xmm4 -.byte 102,15,56,0,211 + pshufb %xmm3,%xmm2 movdqa %xmm0,%xmm3 pxor %xmm5,%xmm2 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 addl $16,%edx pxor %xmm2,%xmm0 -.byte 102,15,56,0,220 + pshufb %xmm4,%xmm3 addl $16,%ecx pxor %xmm0,%xmm3 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 andl $48,%ecx subl $1,%eax pxor %xmm3,%xmm0 @@ -125,30 +125,30 @@ pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm6,%xmm0 -.byte 102,15,56,0,232 + pshufb %xmm0,%xmm5 movdqa %xmm7,%xmm3 pxor %xmm1,%xmm0 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 movdqa %xmm7,%xmm4 pxor %xmm5,%xmm3 -.byte 102,15,56,0,224 + pshufb %xmm0,%xmm4 movdqa %xmm7,%xmm2 pxor %xmm5,%xmm4 -.byte 102,15,56,0,211 + pshufb %xmm3,%xmm2 movdqa %xmm7,%xmm3 pxor %xmm0,%xmm2 -.byte 102,15,56,0,220 + pshufb %xmm4,%xmm3 movdqu (%edx),%xmm5 pxor %xmm1,%xmm3 jnz .L001enc_loop movdqa 96(%ebp),%xmm4 movdqa 112(%ebp),%xmm0 -.byte 102,15,56,0,226 + pshufb %xmm2,%xmm4 pxor %xmm5,%xmm4 -.byte 102,15,56,0,195 + pshufb %xmm3,%xmm0 movdqa 64(%ebx,%ecx,1),%xmm1 pxor %xmm4,%xmm0 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 ret .size _vpaes_encrypt_core,.-_vpaes_encrypt_core .hidden _vpaes_decrypt_core @@ -165,10 +165,10 @@ movdqu (%edx),%xmm5 shll $4,%ecx pand %xmm6,%xmm0 -.byte 102,15,56,0,208 + pshufb %xmm0,%xmm2 movdqa -48(%ebx),%xmm0 xorl $48,%ecx -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 andl $48,%ecx pxor %xmm5,%xmm2 movdqa 
176(%ebp),%xmm5 @@ -180,32 +180,32 @@ .L003dec_loop: movdqa -32(%ebx),%xmm4 movdqa -16(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,203 + pshufb %xmm2,%xmm4 + pshufb %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa (%ebx),%xmm4 pxor %xmm1,%xmm0 movdqa 16(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa 32(%ebx),%xmm4 pxor %xmm1,%xmm0 movdqa 48(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa 64(%ebx),%xmm4 pxor %xmm1,%xmm0 movdqa 80(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 pxor %xmm4,%xmm0 addl $16,%edx -.byte 102,15,58,15,237,12 + palignr $12,%xmm5,%xmm5 pxor %xmm1,%xmm0 subl $1,%eax .L002dec_entry: @@ -214,30 +214,30 @@ pandn %xmm0,%xmm1 pand %xmm6,%xmm0 psrld $4,%xmm1 -.byte 102,15,56,0,208 + pshufb %xmm0,%xmm2 movdqa %xmm7,%xmm3 pxor %xmm1,%xmm0 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 movdqa %xmm7,%xmm4 pxor %xmm2,%xmm3 -.byte 102,15,56,0,224 + pshufb %xmm0,%xmm4 pxor %xmm2,%xmm4 movdqa %xmm7,%xmm2 -.byte 102,15,56,0,211 + pshufb %xmm3,%xmm2 movdqa %xmm7,%xmm3 pxor %xmm0,%xmm2 -.byte 102,15,56,0,220 + pshufb %xmm4,%xmm3 movdqu (%edx),%xmm0 pxor %xmm1,%xmm3 jnz .L003dec_loop movdqa 96(%ebx),%xmm4 -.byte 102,15,56,0,226 + pshufb %xmm2,%xmm4 pxor %xmm0,%xmm4 movdqa 112(%ebx),%xmm0 movdqa (%ecx),%xmm2 -.byte 102,15,56,0,195 + pshufb %xmm3,%xmm0 pxor %xmm4,%xmm0 -.byte 102,15,56,0,194 + pshufb %xmm2,%xmm0 ret .size _vpaes_decrypt_core,.-_vpaes_decrypt_core .hidden _vpaes_schedule_core @@ -258,7 +258,7 @@ jmp .L005schedule_go .L004schedule_am_decrypting: movdqa 256(%ebp,%ecx,1),%xmm1 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 movdqu %xmm3,(%edx) xorl $48,%ecx .L005schedule_go: @@ -283,7 +283,7 @@ movl $4,%eax 
.L011loop_schedule_192: call _vpaes_schedule_round -.byte 102,15,58,15,198,8 + palignr $8,%xmm6,%xmm0 call _vpaes_schedule_mangle call _vpaes_schedule_192_smear call _vpaes_schedule_mangle @@ -317,7 +317,7 @@ testl %edi,%edi jnz .L013schedule_mangle_last_dec movdqa 256(%ebp,%ecx,1),%xmm1 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 leal 352(%ebp),%ebx addl $32,%edx .L013schedule_mangle_last_dec: @@ -354,11 +354,11 @@ _vpaes_schedule_round: movdqa 8(%esp),%xmm2 pxor %xmm1,%xmm1 -.byte 102,15,58,15,202,15 -.byte 102,15,58,15,210,15 + palignr $15,%xmm2,%xmm1 + palignr $15,%xmm2,%xmm2 pxor %xmm1,%xmm7 pshufd $255,%xmm0,%xmm0 -.byte 102,15,58,15,192,1 + palignr $1,%xmm0,%xmm0 movdqa %xmm2,8(%esp) .L_vpaes_schedule_low_round: movdqa %xmm7,%xmm1 @@ -375,24 +375,24 @@ psrld $4,%xmm1 pand %xmm4,%xmm0 movdqa -32(%ebp),%xmm2 -.byte 102,15,56,0,208 + pshufb %xmm0,%xmm2 pxor %xmm1,%xmm0 movdqa %xmm5,%xmm3 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 pxor %xmm2,%xmm3 movdqa %xmm5,%xmm4 -.byte 102,15,56,0,224 + pshufb %xmm0,%xmm4 pxor %xmm2,%xmm4 movdqa %xmm5,%xmm2 -.byte 102,15,56,0,211 + pshufb %xmm3,%xmm2 pxor %xmm0,%xmm2 movdqa %xmm5,%xmm3 -.byte 102,15,56,0,220 + pshufb %xmm4,%xmm3 pxor %xmm1,%xmm3 movdqa 32(%ebp),%xmm4 -.byte 102,15,56,0,226 + pshufb %xmm2,%xmm4 movdqa 48(%ebp),%xmm0 -.byte 102,15,56,0,195 + pshufb %xmm3,%xmm0 pxor %xmm4,%xmm0 pxor %xmm7,%xmm0 movdqa %xmm0,%xmm7 @@ -408,9 +408,9 @@ psrld $4,%xmm1 pand %xmm2,%xmm0 movdqa (%ebx),%xmm2 -.byte 102,15,56,0,208 + pshufb %xmm0,%xmm2 movdqa 16(%ebx),%xmm0 -.byte 102,15,56,0,193 + pshufb %xmm1,%xmm0 pxor %xmm2,%xmm0 ret .size _vpaes_schedule_transform,.-_vpaes_schedule_transform @@ -424,11 +424,11 @@ jnz .L014schedule_mangle_dec addl $16,%edx pxor 336(%ebp),%xmm4 -.byte 102,15,56,0,229 + pshufb %xmm5,%xmm4 movdqa %xmm4,%xmm3 -.byte 102,15,56,0,229 + pshufb %xmm5,%xmm4 pxor %xmm4,%xmm3 -.byte 102,15,56,0,229 + pshufb %xmm5,%xmm4 pxor %xmm4,%xmm3 jmp .L015schedule_mangle_both .align 16 @@ -440,35 +440,35 @@ psrld 
$4,%xmm1 pand %xmm2,%xmm4 movdqa (%esi),%xmm2 -.byte 102,15,56,0,212 + pshufb %xmm4,%xmm2 movdqa 16(%esi),%xmm3 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 + pshufb %xmm5,%xmm3 movdqa 32(%esi),%xmm2 -.byte 102,15,56,0,212 + pshufb %xmm4,%xmm2 pxor %xmm3,%xmm2 movdqa 48(%esi),%xmm3 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 + pshufb %xmm5,%xmm3 movdqa 64(%esi),%xmm2 -.byte 102,15,56,0,212 + pshufb %xmm4,%xmm2 pxor %xmm3,%xmm2 movdqa 80(%esi),%xmm3 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 + pshufb %xmm5,%xmm3 movdqa 96(%esi),%xmm2 -.byte 102,15,56,0,212 + pshufb %xmm4,%xmm2 pxor %xmm3,%xmm2 movdqa 112(%esi),%xmm3 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 pxor %xmm2,%xmm3 addl $-16,%edx .L015schedule_mangle_both: movdqa 256(%ebp,%ecx,1),%xmm1 -.byte 102,15,56,0,217 + pshufb %xmm1,%xmm3 addl $-16,%ecx andl $48,%ecx movdqu %xmm3,(%edx)
diff --git a/gen/bcm/vpaes-x86-win.asm b/gen/bcm/vpaes-x86-win.asm index 137b31e..30ba96c 100644 --- a/gen/bcm/vpaes-x86-win.asm +++ b/gen/bcm/vpaes-x86-win.asm
@@ -88,12 +88,12 @@ pandn xmm1,xmm0 pand xmm0,xmm6 movdqu xmm5,[edx] -db 102,15,56,0,208 + pshufb xmm2,xmm0 movdqa xmm0,[16+ebp] pxor xmm2,xmm5 psrld xmm1,4 add edx,16 -db 102,15,56,0,193 + pshufb xmm0,xmm1 lea ebx,[192+ebp] pxor xmm0,xmm2 jmp NEAR L$000enc_entry @@ -101,25 +101,25 @@ L$001enc_loop: movdqa xmm4,[32+ebp] movdqa xmm0,[48+ebp] -db 102,15,56,0,226 -db 102,15,56,0,195 + pshufb xmm4,xmm2 + pshufb xmm0,xmm3 pxor xmm4,xmm5 movdqa xmm5,[64+ebp] pxor xmm0,xmm4 movdqa xmm1,[ecx*1+ebx-64] -db 102,15,56,0,234 + pshufb xmm5,xmm2 movdqa xmm2,[80+ebp] movdqa xmm4,[ecx*1+ebx] -db 102,15,56,0,211 + pshufb xmm2,xmm3 movdqa xmm3,xmm0 pxor xmm2,xmm5 -db 102,15,56,0,193 + pshufb xmm0,xmm1 add edx,16 pxor xmm0,xmm2 -db 102,15,56,0,220 + pshufb xmm3,xmm4 add ecx,16 pxor xmm3,xmm0 -db 102,15,56,0,193 + pshufb xmm0,xmm1 and ecx,48 sub eax,1 pxor xmm0,xmm3 @@ -129,30 +129,30 @@ pandn xmm1,xmm0 psrld xmm1,4 pand xmm0,xmm6 -db 102,15,56,0,232 + pshufb xmm5,xmm0 movdqa xmm3,xmm7 pxor xmm0,xmm1 -db 102,15,56,0,217 + pshufb xmm3,xmm1 movdqa xmm4,xmm7 pxor xmm3,xmm5 -db 102,15,56,0,224 + pshufb xmm4,xmm0 movdqa xmm2,xmm7 pxor xmm4,xmm5 -db 102,15,56,0,211 + pshufb xmm2,xmm3 movdqa xmm3,xmm7 pxor xmm2,xmm0 -db 102,15,56,0,220 + pshufb xmm3,xmm4 movdqu xmm5,[edx] pxor xmm3,xmm1 jnz NEAR L$001enc_loop movdqa xmm4,[96+ebp] movdqa xmm0,[112+ebp] -db 102,15,56,0,226 + pshufb xmm4,xmm2 pxor xmm4,xmm5 -db 102,15,56,0,195 + pshufb xmm0,xmm3 movdqa xmm1,[64+ecx*1+ebx] pxor xmm0,xmm4 -db 102,15,56,0,193 + pshufb xmm0,xmm1 ret align 16 __vpaes_decrypt_core: @@ -166,10 +166,10 @@ movdqu xmm5,[edx] shl ecx,4 pand xmm0,xmm6 -db 102,15,56,0,208 + pshufb xmm2,xmm0 movdqa xmm0,[ebx-48] xor ecx,48 -db 102,15,56,0,193 + pshufb xmm0,xmm1 and ecx,48 pxor xmm2,xmm5 movdqa xmm5,[176+ebp] @@ -181,32 +181,32 @@ L$003dec_loop: movdqa xmm4,[ebx-32] movdqa xmm1,[ebx-16] -db 102,15,56,0,226 -db 102,15,56,0,203 + pshufb xmm4,xmm2 + pshufb xmm1,xmm3 pxor xmm0,xmm4 movdqa xmm4,[ebx] pxor xmm0,xmm1 movdqa 
xmm1,[16+ebx] -db 102,15,56,0,226 -db 102,15,56,0,197 -db 102,15,56,0,203 + pshufb xmm4,xmm2 + pshufb xmm0,xmm5 + pshufb xmm1,xmm3 pxor xmm0,xmm4 movdqa xmm4,[32+ebx] pxor xmm0,xmm1 movdqa xmm1,[48+ebx] -db 102,15,56,0,226 -db 102,15,56,0,197 -db 102,15,56,0,203 + pshufb xmm4,xmm2 + pshufb xmm0,xmm5 + pshufb xmm1,xmm3 pxor xmm0,xmm4 movdqa xmm4,[64+ebx] pxor xmm0,xmm1 movdqa xmm1,[80+ebx] -db 102,15,56,0,226 -db 102,15,56,0,197 -db 102,15,56,0,203 + pshufb xmm4,xmm2 + pshufb xmm0,xmm5 + pshufb xmm1,xmm3 pxor xmm0,xmm4 add edx,16 -db 102,15,58,15,237,12 + palignr xmm5,xmm5,12 pxor xmm0,xmm1 sub eax,1 L$002dec_entry: @@ -215,30 +215,30 @@ pandn xmm1,xmm0 pand xmm0,xmm6 psrld xmm1,4 -db 102,15,56,0,208 + pshufb xmm2,xmm0 movdqa xmm3,xmm7 pxor xmm0,xmm1 -db 102,15,56,0,217 + pshufb xmm3,xmm1 movdqa xmm4,xmm7 pxor xmm3,xmm2 -db 102,15,56,0,224 + pshufb xmm4,xmm0 pxor xmm4,xmm2 movdqa xmm2,xmm7 -db 102,15,56,0,211 + pshufb xmm2,xmm3 movdqa xmm3,xmm7 pxor xmm2,xmm0 -db 102,15,56,0,220 + pshufb xmm3,xmm4 movdqu xmm0,[edx] pxor xmm3,xmm1 jnz NEAR L$003dec_loop movdqa xmm4,[96+ebx] -db 102,15,56,0,226 + pshufb xmm4,xmm2 pxor xmm4,xmm0 movdqa xmm0,[112+ebx] movdqa xmm2,[ecx] -db 102,15,56,0,195 + pshufb xmm0,xmm3 pxor xmm0,xmm4 -db 102,15,56,0,194 + pshufb xmm0,xmm2 ret align 16 __vpaes_schedule_core: @@ -256,7 +256,7 @@ jmp NEAR L$005schedule_go L$004schedule_am_decrypting: movdqa xmm1,[256+ecx*1+ebp] -db 102,15,56,0,217 + pshufb xmm3,xmm1 movdqu [edx],xmm3 xor ecx,48 L$005schedule_go: @@ -281,7 +281,7 @@ mov eax,4 L$011loop_schedule_192: call __vpaes_schedule_round -db 102,15,58,15,198,8 + palignr xmm0,xmm6,8 call __vpaes_schedule_mangle call __vpaes_schedule_192_smear call __vpaes_schedule_mangle @@ -315,7 +315,7 @@ test edi,edi jnz NEAR L$013schedule_mangle_last_dec movdqa xmm1,[256+ecx*1+ebp] -db 102,15,56,0,193 + pshufb xmm0,xmm1 lea ebx,[352+ebp] add edx,32 L$013schedule_mangle_last_dec: @@ -346,11 +346,11 @@ __vpaes_schedule_round: movdqa xmm2,[8+esp] pxor xmm1,xmm1 
-db 102,15,58,15,202,15 -db 102,15,58,15,210,15 + palignr xmm1,xmm2,15 + palignr xmm2,xmm2,15 pxor xmm7,xmm1 pshufd xmm0,xmm0,255 -db 102,15,58,15,192,1 + palignr xmm0,xmm0,1 movdqa [8+esp],xmm2 L$_vpaes_schedule_low_round: movdqa xmm1,xmm7 @@ -367,24 +367,24 @@ psrld xmm1,4 pand xmm0,xmm4 movdqa xmm2,[ebp-32] -db 102,15,56,0,208 + pshufb xmm2,xmm0 pxor xmm0,xmm1 movdqa xmm3,xmm5 -db 102,15,56,0,217 + pshufb xmm3,xmm1 pxor xmm3,xmm2 movdqa xmm4,xmm5 -db 102,15,56,0,224 + pshufb xmm4,xmm0 pxor xmm4,xmm2 movdqa xmm2,xmm5 -db 102,15,56,0,211 + pshufb xmm2,xmm3 pxor xmm2,xmm0 movdqa xmm3,xmm5 -db 102,15,56,0,220 + pshufb xmm3,xmm4 pxor xmm3,xmm1 movdqa xmm4,[32+ebp] -db 102,15,56,0,226 + pshufb xmm4,xmm2 movdqa xmm0,[48+ebp] -db 102,15,56,0,195 + pshufb xmm0,xmm3 pxor xmm0,xmm4 pxor xmm0,xmm7 movdqa xmm7,xmm0 @@ -397,9 +397,9 @@ psrld xmm1,4 pand xmm0,xmm2 movdqa xmm2,[ebx] -db 102,15,56,0,208 + pshufb xmm2,xmm0 movdqa xmm0,[16+ebx] -db 102,15,56,0,193 + pshufb xmm0,xmm1 pxor xmm0,xmm2 ret align 16 @@ -410,11 +410,11 @@ jnz NEAR L$014schedule_mangle_dec add edx,16 pxor xmm4,[336+ebp] -db 102,15,56,0,229 + pshufb xmm4,xmm5 movdqa xmm3,xmm4 -db 102,15,56,0,229 + pshufb xmm4,xmm5 pxor xmm3,xmm4 -db 102,15,56,0,229 + pshufb xmm4,xmm5 pxor xmm3,xmm4 jmp NEAR L$015schedule_mangle_both align 16 @@ -426,35 +426,35 @@ psrld xmm1,4 pand xmm4,xmm2 movdqa xmm2,[esi] -db 102,15,56,0,212 + pshufb xmm2,xmm4 movdqa xmm3,[16+esi] -db 102,15,56,0,217 + pshufb xmm3,xmm1 pxor xmm3,xmm2 -db 102,15,56,0,221 + pshufb xmm3,xmm5 movdqa xmm2,[32+esi] -db 102,15,56,0,212 + pshufb xmm2,xmm4 pxor xmm2,xmm3 movdqa xmm3,[48+esi] -db 102,15,56,0,217 + pshufb xmm3,xmm1 pxor xmm3,xmm2 -db 102,15,56,0,221 + pshufb xmm3,xmm5 movdqa xmm2,[64+esi] -db 102,15,56,0,212 + pshufb xmm2,xmm4 pxor xmm2,xmm3 movdqa xmm3,[80+esi] -db 102,15,56,0,217 + pshufb xmm3,xmm1 pxor xmm3,xmm2 -db 102,15,56,0,221 + pshufb xmm3,xmm5 movdqa xmm2,[96+esi] -db 102,15,56,0,212 + pshufb xmm2,xmm4 pxor xmm2,xmm3 movdqa 
xmm3,[112+esi] -db 102,15,56,0,217 + pshufb xmm3,xmm1 pxor xmm3,xmm2 add edx,-16 L$015schedule_mangle_both: movdqa xmm1,[256+ecx*1+ebp] -db 102,15,56,0,217 + pshufb xmm3,xmm1 add ecx,-16 and ecx,48 movdqu [edx],xmm3
diff --git a/gen/crypto/chacha-x86-apple.S b/gen/crypto/chacha-x86-apple.S index 48293da..c03fb5b 100644 --- a/gen/crypto/chacha-x86-apple.S +++ b/gen/crypto/chacha-x86-apple.S
@@ -850,7 +850,7 @@ L010loop1x: paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 -.byte 102,15,56,0,222 + pshufb %xmm6,%xmm3 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 @@ -859,7 +859,7 @@ por %xmm4,%xmm1 paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 -.byte 102,15,56,0,223 + pshufb %xmm7,%xmm3 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 @@ -872,7 +872,7 @@ nop paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 -.byte 102,15,56,0,222 + pshufb %xmm6,%xmm3 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 @@ -881,7 +881,7 @@ por %xmm4,%xmm1 paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 -.byte 102,15,56,0,223 + pshufb %xmm7,%xmm3 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4
diff --git a/gen/crypto/chacha-x86-linux.S b/gen/crypto/chacha-x86-linux.S index 566fbb4..9ad20a0 100644 --- a/gen/crypto/chacha-x86-linux.S +++ b/gen/crypto/chacha-x86-linux.S
@@ -853,7 +853,7 @@ .L010loop1x: paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 -.byte 102,15,56,0,222 + pshufb %xmm6,%xmm3 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 @@ -862,7 +862,7 @@ por %xmm4,%xmm1 paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 -.byte 102,15,56,0,223 + pshufb %xmm7,%xmm3 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 @@ -875,7 +875,7 @@ nop paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 -.byte 102,15,56,0,222 + pshufb %xmm6,%xmm3 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 @@ -884,7 +884,7 @@ por %xmm4,%xmm1 paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 -.byte 102,15,56,0,223 + pshufb %xmm7,%xmm3 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4
diff --git a/gen/crypto/chacha-x86-win.asm b/gen/crypto/chacha-x86-win.asm index 1e64634..799a6aa 100644 --- a/gen/crypto/chacha-x86-win.asm +++ b/gen/crypto/chacha-x86-win.asm
@@ -856,7 +856,7 @@ L$010loop1x: paddd xmm0,xmm1 pxor xmm3,xmm0 -db 102,15,56,0,222 + pshufb xmm3,xmm6 paddd xmm2,xmm3 pxor xmm1,xmm2 movdqa xmm4,xmm1 @@ -865,7 +865,7 @@ por xmm1,xmm4 paddd xmm0,xmm1 pxor xmm3,xmm0 -db 102,15,56,0,223 + pshufb xmm3,xmm7 paddd xmm2,xmm3 pxor xmm1,xmm2 movdqa xmm4,xmm1 @@ -878,7 +878,7 @@ nop paddd xmm0,xmm1 pxor xmm3,xmm0 -db 102,15,56,0,222 + pshufb xmm3,xmm6 paddd xmm2,xmm3 pxor xmm1,xmm2 movdqa xmm4,xmm1 @@ -887,7 +887,7 @@ por xmm1,xmm4 paddd xmm0,xmm1 pxor xmm3,xmm0 -db 102,15,56,0,223 + pshufb xmm3,xmm7 paddd xmm2,xmm3 pxor xmm1,xmm2 movdqa xmm4,xmm1