Stop manually encoding various x86 extension instructions in perlasm

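x86asm.pl previously emitted raw opcode bytes for register forms of
pextrd, pinsrd, pshufb, palignr, pclmulqdq, rdrand, and rdseed. Remove
those helpers so the generated files contain the instruction mnemonics
and the assembler performs the encoding.

Confirmed with objdump -d that object files remain unchanged.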

Bug: 478924351
Change-Id: I590f2a31ef32a79a5a06414a3a0675ecdb4cc9b3
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/87907
Reviewed-by: Lily Chen <chlily@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Presubmit-BoringSSL-Verified: boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com <boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com>
Commit-Queue: Lily Chen <chlily@google.com>
diff --git a/crypto/perlasm/x86asm.pl b/crypto/perlasm/x86asm.pl
index acb1e6f..80a097d 100644
--- a/crypto/perlasm/x86asm.pl
+++ b/crypto/perlasm/x86asm.pl
@@ -114,65 +114,6 @@
     {	&::generic("movq",@_);			}
 }
 
-# SSE>2 instructions
-my %regrm = (	"eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
-		"esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7	);
-sub ::pextrd
-{ my($dst,$src,$imm)=@_;
-    if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
-    {	&::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm);	}
-    else
-    {	&::generic("pextrd",@_);		}
-}
-
-sub ::pinsrd
-{ my($dst,$src,$imm)=@_;
-    if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
-    {	&::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm);	}
-    else
-    {	&::generic("pinsrd",@_);		}
-}
-
-sub ::pshufb
-{ my($dst,$src)=@_;
-    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-    {	&data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2);	}
-    else
-    {	&::generic("pshufb",@_);		}
-}
-
-sub ::palignr
-{ my($dst,$src,$imm)=@_;
-    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-    {	&::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm);	}
-    else
-    {	&::generic("palignr",@_);		}
-}
-
-sub ::pclmulqdq
-{ my($dst,$src,$imm)=@_;
-    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-    {	&::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm);	}
-    else
-    {	&::generic("pclmulqdq",@_);		}
-}
-
-sub ::rdrand
-{ my ($dst)=@_;
-    if ($dst =~ /(e[a-dsd][ixp])/)
-    {	&::data_byte(0x0f,0xc7,0xf0|$regrm{$dst});	}
-    else
-    {	&::generic("rdrand",@_);	}
-}
-
-sub ::rdseed
-{ my ($dst)=@_;
-    if ($dst =~ /(e[a-dsd][ixp])/)
-    {	&::data_byte(0x0f,0xc7,0xf8|$regrm{$dst});	}
-    else
-    {	&::generic("rdrand",@_);	}
-}
-
 sub rxb {
  local *opcode=shift;
  my ($dst,$src1,$src2,$rxb)=@_;
diff --git a/gen/bcm/aesni-x86-apple.S b/gen/bcm/aesni-x86-apple.S
index db13057..7454d23 100644
--- a/gen/bcm/aesni-x86-apple.S
+++ b/gen/bcm/aesni-x86-apple.S
@@ -638,7 +638,7 @@
 	movdqa	%xmm7,%xmm2
 	leal	32(%edx,%ecx,1),%edx
 	subl	%ecx,%ebx
-.byte	102,15,56,0,253
+	pshufb	%xmm5,%xmm7
 L031ccm64_enc_outer:
 	movups	(%ebp),%xmm0
 	movl	%ebx,%ecx
@@ -667,7 +667,7 @@
 	xorps	%xmm2,%xmm6
 	movdqa	%xmm7,%xmm2
 	movups	%xmm6,(%edi)
-.byte	102,15,56,0,213
+	pshufb	%xmm5,%xmm2
 	leal	16(%edi),%edi
 	jnz	L031ccm64_enc_outer
 	movl	48(%esp),%esp
@@ -722,7 +722,7 @@
 	movdqa	%xmm7,%xmm2
 	movl	%edx,%ebp
 	movl	%ecx,%ebx
-.byte	102,15,56,0,253
+	pshufb	%xmm5,%xmm7
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
@@ -749,7 +749,7 @@
 	movdqa	%xmm7,%xmm2
 	movups	%xmm6,(%edi)
 	leal	16(%edi),%edi
-.byte	102,15,56,0,213
+	pshufb	%xmm5,%xmm2
 	subl	$1,%eax
 	jz	L035ccm64_dec_break
 	movups	(%ebp),%xmm0
@@ -851,29 +851,29 @@
 	movl	%ecx,20(%esp)
 	movl	%ecx,24(%esp)
 	movl	%ebp,28(%esp)
-.byte	102,15,58,22,251,3
-.byte	102,15,58,34,253,3
+	pextrd	$3,%xmm7,%ebx
+	pinsrd	$3,%ebp,%xmm7
 	movl	240(%edx),%ecx
 	bswap	%ebx
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	movdqa	(%esp),%xmm2
-.byte	102,15,58,34,195,0
+	pinsrd	$0,%ebx,%xmm0
 	leal	3(%ebx),%ebp
-.byte	102,15,58,34,205,0
+	pinsrd	$0,%ebp,%xmm1
 	incl	%ebx
-.byte	102,15,58,34,195,1
+	pinsrd	$1,%ebx,%xmm0
 	incl	%ebp
-.byte	102,15,58,34,205,1
+	pinsrd	$1,%ebp,%xmm1
 	incl	%ebx
-.byte	102,15,58,34,195,2
+	pinsrd	$2,%ebx,%xmm0
 	incl	%ebp
-.byte	102,15,58,34,205,2
+	pinsrd	$2,%ebp,%xmm1
 	movdqa	%xmm0,48(%esp)
-.byte	102,15,56,0,194
+	pshufb	%xmm2,%xmm0
 	movdqu	(%edx),%xmm6
 	movdqa	%xmm1,64(%esp)
-.byte	102,15,56,0,202
+	pshufb	%xmm2,%xmm1
 	pshufd	$192,%xmm0,%xmm2
 	pshufd	$128,%xmm0,%xmm3
 	cmpl	$6,%eax
@@ -930,12 +930,12 @@
 	movups	80(%esi),%xmm3
 	leal	96(%esi),%esi
 	movdqa	%xmm0,48(%esp)
-.byte	102,15,56,0,194
+	pshufb	%xmm2,%xmm0
 	xorps	%xmm4,%xmm6
 	movups	%xmm5,48(%edi)
 	xorps	%xmm3,%xmm7
 	movdqa	%xmm1,64(%esp)
-.byte	102,15,56,0,202
+	pshufb	%xmm2,%xmm1
 	movups	%xmm6,64(%edi)
 	pshufd	$192,%xmm0,%xmm2
 	movups	%xmm7,80(%edi)
@@ -2309,7 +2309,7 @@
 	movdqa	%xmm0,%xmm2
 	movdqu	%xmm0,-16(%edx)
 L115loop_key128:
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	aesenclast	%xmm4,%xmm0
 	pslld	$1,%xmm4
 	leal	16(%edx),%edx
@@ -2326,7 +2326,7 @@
 	decl	%ecx
 	jnz	L115loop_key128
 	movdqa	48(%ebx),%xmm4
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	aesenclast	%xmm4,%xmm0
 	pslld	$1,%xmm4
 	movdqa	%xmm2,%xmm3
@@ -2339,7 +2339,7 @@
 	pxor	%xmm2,%xmm0
 	movdqu	%xmm0,(%edx)
 	movdqa	%xmm0,%xmm2
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	aesenclast	%xmm4,%xmm0
 	movdqa	%xmm2,%xmm3
 	pslldq	$4,%xmm2
@@ -2363,7 +2363,7 @@
 L117loop_key192:
 	movq	%xmm2,(%edx)
 	movdqa	%xmm2,%xmm1
-.byte	102,15,56,0,213
+	pshufb	%xmm5,%xmm2
 	aesenclast	%xmm4,%xmm2
 	pslld	$1,%xmm4
 	leal	24(%edx),%edx
@@ -2397,7 +2397,7 @@
 	movdqa	%xmm2,%xmm1
 	movdqu	%xmm2,-16(%edx)
 L118loop_key256:
-.byte	102,15,56,0,213
+	pshufb	%xmm5,%xmm2
 	aesenclast	%xmm4,%xmm2
 	movdqa	%xmm0,%xmm3
 	pslldq	$4,%xmm0
diff --git a/gen/bcm/aesni-x86-linux.S b/gen/bcm/aesni-x86-linux.S
index 6a7d93c..b6772c1 100644
--- a/gen/bcm/aesni-x86-linux.S
+++ b/gen/bcm/aesni-x86-linux.S
@@ -661,7 +661,7 @@
 	movdqa	%xmm7,%xmm2
 	leal	32(%edx,%ecx,1),%edx
 	subl	%ecx,%ebx
-.byte	102,15,56,0,253
+	pshufb	%xmm5,%xmm7
 .L031ccm64_enc_outer:
 	movups	(%ebp),%xmm0
 	movl	%ebx,%ecx
@@ -690,7 +690,7 @@
 	xorps	%xmm2,%xmm6
 	movdqa	%xmm7,%xmm2
 	movups	%xmm6,(%edi)
-.byte	102,15,56,0,213
+	pshufb	%xmm5,%xmm2
 	leal	16(%edi),%edi
 	jnz	.L031ccm64_enc_outer
 	movl	48(%esp),%esp
@@ -747,7 +747,7 @@
 	movdqa	%xmm7,%xmm2
 	movl	%edx,%ebp
 	movl	%ecx,%ebx
-.byte	102,15,56,0,253
+	pshufb	%xmm5,%xmm7
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
@@ -774,7 +774,7 @@
 	movdqa	%xmm7,%xmm2
 	movups	%xmm6,(%edi)
 	leal	16(%edi),%edi
-.byte	102,15,56,0,213
+	pshufb	%xmm5,%xmm2
 	subl	$1,%eax
 	jz	.L035ccm64_dec_break
 	movups	(%ebp),%xmm0
@@ -878,29 +878,29 @@
 	movl	%ecx,20(%esp)
 	movl	%ecx,24(%esp)
 	movl	%ebp,28(%esp)
-.byte	102,15,58,22,251,3
-.byte	102,15,58,34,253,3
+	pextrd	$3,%xmm7,%ebx
+	pinsrd	$3,%ebp,%xmm7
 	movl	240(%edx),%ecx
 	bswap	%ebx
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	movdqa	(%esp),%xmm2
-.byte	102,15,58,34,195,0
+	pinsrd	$0,%ebx,%xmm0
 	leal	3(%ebx),%ebp
-.byte	102,15,58,34,205,0
+	pinsrd	$0,%ebp,%xmm1
 	incl	%ebx
-.byte	102,15,58,34,195,1
+	pinsrd	$1,%ebx,%xmm0
 	incl	%ebp
-.byte	102,15,58,34,205,1
+	pinsrd	$1,%ebp,%xmm1
 	incl	%ebx
-.byte	102,15,58,34,195,2
+	pinsrd	$2,%ebx,%xmm0
 	incl	%ebp
-.byte	102,15,58,34,205,2
+	pinsrd	$2,%ebp,%xmm1
 	movdqa	%xmm0,48(%esp)
-.byte	102,15,56,0,194
+	pshufb	%xmm2,%xmm0
 	movdqu	(%edx),%xmm6
 	movdqa	%xmm1,64(%esp)
-.byte	102,15,56,0,202
+	pshufb	%xmm2,%xmm1
 	pshufd	$192,%xmm0,%xmm2
 	pshufd	$128,%xmm0,%xmm3
 	cmpl	$6,%eax
@@ -957,12 +957,12 @@
 	movups	80(%esi),%xmm3
 	leal	96(%esi),%esi
 	movdqa	%xmm0,48(%esp)
-.byte	102,15,56,0,194
+	pshufb	%xmm2,%xmm0
 	xorps	%xmm4,%xmm6
 	movups	%xmm5,48(%edi)
 	xorps	%xmm3,%xmm7
 	movdqa	%xmm1,64(%esp)
-.byte	102,15,56,0,202
+	pshufb	%xmm2,%xmm1
 	movups	%xmm6,64(%edi)
 	pshufd	$192,%xmm0,%xmm2
 	movups	%xmm7,80(%edi)
@@ -2346,7 +2346,7 @@
 	movdqa	%xmm0,%xmm2
 	movdqu	%xmm0,-16(%edx)
 .L115loop_key128:
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	aesenclast	%xmm4,%xmm0
 	pslld	$1,%xmm4
 	leal	16(%edx),%edx
@@ -2363,7 +2363,7 @@
 	decl	%ecx
 	jnz	.L115loop_key128
 	movdqa	48(%ebx),%xmm4
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	aesenclast	%xmm4,%xmm0
 	pslld	$1,%xmm4
 	movdqa	%xmm2,%xmm3
@@ -2376,7 +2376,7 @@
 	pxor	%xmm2,%xmm0
 	movdqu	%xmm0,(%edx)
 	movdqa	%xmm0,%xmm2
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	aesenclast	%xmm4,%xmm0
 	movdqa	%xmm2,%xmm3
 	pslldq	$4,%xmm2
@@ -2400,7 +2400,7 @@
 .L117loop_key192:
 	movq	%xmm2,(%edx)
 	movdqa	%xmm2,%xmm1
-.byte	102,15,56,0,213
+	pshufb	%xmm5,%xmm2
 	aesenclast	%xmm4,%xmm2
 	pslld	$1,%xmm4
 	leal	24(%edx),%edx
@@ -2434,7 +2434,7 @@
 	movdqa	%xmm2,%xmm1
 	movdqu	%xmm2,-16(%edx)
 .L118loop_key256:
-.byte	102,15,56,0,213
+	pshufb	%xmm5,%xmm2
 	aesenclast	%xmm4,%xmm2
 	movdqa	%xmm0,%xmm3
 	pslldq	$4,%xmm0
diff --git a/gen/bcm/aesni-x86-win.asm b/gen/bcm/aesni-x86-win.asm
index 4bb5431..660b772 100644
--- a/gen/bcm/aesni-x86-win.asm
+++ b/gen/bcm/aesni-x86-win.asm
@@ -635,7 +635,7 @@
 	movdqa	xmm2,xmm7
 	lea	edx,[32+ecx*1+edx]
 	sub	ebx,ecx
-db	102,15,56,0,253
+	pshufb	xmm7,xmm5
 L$031ccm64_enc_outer:
 	movups	xmm0,[ebp]
 	mov	ecx,ebx
@@ -664,7 +664,7 @@
 	xorps	xmm6,xmm2
 	movdqa	xmm2,xmm7
 	movups	[edi],xmm6
-db	102,15,56,0,213
+	pshufb	xmm2,xmm5
 	lea	edi,[16+edi]
 	jnz	NEAR L$031ccm64_enc_outer
 	mov	esp,DWORD [48+esp]
@@ -718,7 +718,7 @@
 	movdqa	xmm2,xmm7
 	mov	ebp,edx
 	mov	ebx,ecx
-db	102,15,56,0,253
+	pshufb	xmm7,xmm5
 	movups	xmm0,[edx]
 	movups	xmm1,[16+edx]
 	lea	edx,[32+edx]
@@ -745,7 +745,7 @@
 	movdqa	xmm2,xmm7
 	movups	[edi],xmm6
 	lea	edi,[16+edi]
-db	102,15,56,0,213
+	pshufb	xmm2,xmm5
 	sub	eax,1
 	jz	NEAR L$035ccm64_dec_break
 	movups	xmm0,[ebp]
@@ -846,29 +846,29 @@
 	mov	DWORD [20+esp],ecx
 	mov	DWORD [24+esp],ecx
 	mov	DWORD [28+esp],ebp
-db	102,15,58,22,251,3
-db	102,15,58,34,253,3
+	pextrd	ebx,xmm7,3
+	pinsrd	xmm7,ebp,3
 	mov	ecx,DWORD [240+edx]
 	bswap	ebx
 	pxor	xmm0,xmm0
 	pxor	xmm1,xmm1
 	movdqa	xmm2,[esp]
-db	102,15,58,34,195,0
+	pinsrd	xmm0,ebx,0
 	lea	ebp,[3+ebx]
-db	102,15,58,34,205,0
+	pinsrd	xmm1,ebp,0
 	inc	ebx
-db	102,15,58,34,195,1
+	pinsrd	xmm0,ebx,1
 	inc	ebp
-db	102,15,58,34,205,1
+	pinsrd	xmm1,ebp,1
 	inc	ebx
-db	102,15,58,34,195,2
+	pinsrd	xmm0,ebx,2
 	inc	ebp
-db	102,15,58,34,205,2
+	pinsrd	xmm1,ebp,2
 	movdqa	[48+esp],xmm0
-db	102,15,56,0,194
+	pshufb	xmm0,xmm2
 	movdqu	xmm6,[edx]
 	movdqa	[64+esp],xmm1
-db	102,15,56,0,202
+	pshufb	xmm1,xmm2
 	pshufd	xmm2,xmm0,192
 	pshufd	xmm3,xmm0,128
 	cmp	eax,6
@@ -925,12 +925,12 @@
 	movups	xmm3,[80+esi]
 	lea	esi,[96+esi]
 	movdqa	[48+esp],xmm0
-db	102,15,56,0,194
+	pshufb	xmm0,xmm2
 	xorps	xmm6,xmm4
 	movups	[48+edi],xmm5
 	xorps	xmm7,xmm3
 	movdqa	[64+esp],xmm1
-db	102,15,56,0,202
+	pshufb	xmm1,xmm2
 	movups	[64+edi],xmm6
 	pshufd	xmm2,xmm0,192
 	movups	[80+edi],xmm7
@@ -2299,7 +2299,7 @@
 	movdqa	xmm2,xmm0
 	movdqu	[edx-16],xmm0
 L$115loop_key128:
-db	102,15,56,0,197
+	pshufb	xmm0,xmm5
 	aesenclast	xmm0,xmm4
 	pslld	xmm4,1
 	lea	edx,[16+edx]
@@ -2316,7 +2316,7 @@
 	dec	ecx
 	jnz	NEAR L$115loop_key128
 	movdqa	xmm4,[48+ebx]
-db	102,15,56,0,197
+	pshufb	xmm0,xmm5
 	aesenclast	xmm0,xmm4
 	pslld	xmm4,1
 	movdqa	xmm3,xmm2
@@ -2329,7 +2329,7 @@
 	pxor	xmm0,xmm2
 	movdqu	[edx],xmm0
 	movdqa	xmm2,xmm0
-db	102,15,56,0,197
+	pshufb	xmm0,xmm5
 	aesenclast	xmm0,xmm4
 	movdqa	xmm3,xmm2
 	pslldq	xmm2,4
@@ -2353,7 +2353,7 @@
 L$117loop_key192:
 	movq	[edx],xmm2
 	movdqa	xmm1,xmm2
-db	102,15,56,0,213
+	pshufb	xmm2,xmm5
 	aesenclast	xmm2,xmm4
 	pslld	xmm4,1
 	lea	edx,[24+edx]
@@ -2387,7 +2387,7 @@
 	movdqa	xmm1,xmm2
 	movdqu	[edx-16],xmm2
 L$118loop_key256:
-db	102,15,56,0,213
+	pshufb	xmm2,xmm5
 	aesenclast	xmm2,xmm4
 	movdqa	xmm3,xmm0
 	pslldq	xmm0,4
diff --git a/gen/bcm/ghash-ssse3-x86-apple.S b/gen/bcm/ghash-ssse3-x86-apple.S
index 96cb86f..b58a4fa 100644
--- a/gen/bcm/ghash-ssse3-x86-apple.S
+++ b/gen/bcm/ghash-ssse3-x86-apple.S
@@ -22,7 +22,7 @@
 	popl	%eax
 	movdqa	Lreverse_bytes-L000pic_point(%eax),%xmm7
 	movdqa	Llow4_mask-L000pic_point(%eax),%xmm2
-.byte	102,15,56,0,199
+	pshufb	%xmm7,%xmm0
 	movdqa	%xmm2,%xmm1
 	pandn	%xmm0,%xmm1
 	psrld	$4,%xmm1
@@ -34,12 +34,12 @@
 	movdqu	(%esi),%xmm4
 	leal	16(%esi),%esi
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 	pxor	%xmm5,%xmm2
 	movdqa	%xmm4,%xmm5
 	psllq	$60,%xmm5
@@ -65,12 +65,12 @@
 	movdqu	(%esi),%xmm4
 	leal	16(%esi),%esi
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 	pxor	%xmm5,%xmm2
 	movdqa	%xmm4,%xmm5
 	psllq	$60,%xmm5
@@ -96,12 +96,12 @@
 	movdqu	(%esi),%xmm4
 	leal	16(%esi),%esi
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 	pxor	%xmm5,%xmm2
 	movdqa	%xmm4,%xmm5
 	psllq	$60,%xmm5
@@ -122,7 +122,7 @@
 	psrlq	$5,%xmm3
 	pxor	%xmm3,%xmm2
 	pxor	%xmm3,%xmm3
-.byte	102,15,56,0,215
+	pshufb	%xmm7,%xmm2
 	movdqu	%xmm2,(%edi)
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
@@ -155,12 +155,12 @@
 	popl	%ebx
 	movdqa	Lreverse_bytes-L004pic_point(%ebx),%xmm7
 	andl	$-16,%ecx
-.byte	102,15,56,0,199
+	pshufb	%xmm7,%xmm0
 	pxor	%xmm3,%xmm3
 L005loop_ghash:
 	movdqa	Llow4_mask-L004pic_point(%ebx),%xmm2
 	movdqu	(%edx),%xmm1
-.byte	102,15,56,0,207
+	pshufb	%xmm7,%xmm1
 	pxor	%xmm1,%xmm0
 	movdqa	%xmm2,%xmm1
 	pandn	%xmm0,%xmm1
@@ -172,12 +172,12 @@
 	movdqu	(%esi),%xmm4
 	leal	16(%esi),%esi
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 	pxor	%xmm5,%xmm2
 	movdqa	%xmm4,%xmm5
 	psllq	$60,%xmm5
@@ -203,12 +203,12 @@
 	movdqu	(%esi),%xmm4
 	leal	16(%esi),%esi
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 	pxor	%xmm5,%xmm2
 	movdqa	%xmm4,%xmm5
 	psllq	$60,%xmm5
@@ -234,12 +234,12 @@
 	movdqu	(%esi),%xmm4
 	leal	16(%esi),%esi
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 	pxor	%xmm5,%xmm2
 	movdqa	%xmm4,%xmm5
 	psllq	$60,%xmm5
@@ -265,7 +265,7 @@
 	leal	16(%edx),%edx
 	subl	$16,%ecx
 	jnz	L005loop_ghash
-.byte	102,15,56,0,199
+	pshufb	%xmm7,%xmm0
 	movdqu	%xmm0,(%edi)
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
diff --git a/gen/bcm/ghash-ssse3-x86-linux.S b/gen/bcm/ghash-ssse3-x86-linux.S
index 7fe65e7..7e8d7bb 100644
--- a/gen/bcm/ghash-ssse3-x86-linux.S
+++ b/gen/bcm/ghash-ssse3-x86-linux.S
@@ -23,7 +23,7 @@
 	popl	%eax
 	movdqa	.Lreverse_bytes-.L000pic_point(%eax),%xmm7
 	movdqa	.Llow4_mask-.L000pic_point(%eax),%xmm2
-.byte	102,15,56,0,199
+	pshufb	%xmm7,%xmm0
 	movdqa	%xmm2,%xmm1
 	pandn	%xmm0,%xmm1
 	psrld	$4,%xmm1
@@ -35,12 +35,12 @@
 	movdqu	(%esi),%xmm4
 	leal	16(%esi),%esi
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 	pxor	%xmm5,%xmm2
 	movdqa	%xmm4,%xmm5
 	psllq	$60,%xmm5
@@ -66,12 +66,12 @@
 	movdqu	(%esi),%xmm4
 	leal	16(%esi),%esi
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 	pxor	%xmm5,%xmm2
 	movdqa	%xmm4,%xmm5
 	psllq	$60,%xmm5
@@ -97,12 +97,12 @@
 	movdqu	(%esi),%xmm4
 	leal	16(%esi),%esi
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 	pxor	%xmm5,%xmm2
 	movdqa	%xmm4,%xmm5
 	psllq	$60,%xmm5
@@ -123,7 +123,7 @@
 	psrlq	$5,%xmm3
 	pxor	%xmm3,%xmm2
 	pxor	%xmm3,%xmm3
-.byte	102,15,56,0,215
+	pshufb	%xmm7,%xmm2
 	movdqu	%xmm2,(%edi)
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
@@ -158,12 +158,12 @@
 	popl	%ebx
 	movdqa	.Lreverse_bytes-.L004pic_point(%ebx),%xmm7
 	andl	$-16,%ecx
-.byte	102,15,56,0,199
+	pshufb	%xmm7,%xmm0
 	pxor	%xmm3,%xmm3
 .L005loop_ghash:
 	movdqa	.Llow4_mask-.L004pic_point(%ebx),%xmm2
 	movdqu	(%edx),%xmm1
-.byte	102,15,56,0,207
+	pshufb	%xmm7,%xmm1
 	pxor	%xmm1,%xmm0
 	movdqa	%xmm2,%xmm1
 	pandn	%xmm0,%xmm1
@@ -175,12 +175,12 @@
 	movdqu	(%esi),%xmm4
 	leal	16(%esi),%esi
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 	pxor	%xmm5,%xmm2
 	movdqa	%xmm4,%xmm5
 	psllq	$60,%xmm5
@@ -206,12 +206,12 @@
 	movdqu	(%esi),%xmm4
 	leal	16(%esi),%esi
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 	pxor	%xmm5,%xmm2
 	movdqa	%xmm4,%xmm5
 	psllq	$60,%xmm5
@@ -237,12 +237,12 @@
 	movdqu	(%esi),%xmm4
 	leal	16(%esi),%esi
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 	pxor	%xmm5,%xmm2
 	movdqa	%xmm4,%xmm5
 	psllq	$60,%xmm5
@@ -268,7 +268,7 @@
 	leal	16(%edx),%edx
 	subl	$16,%ecx
 	jnz	.L005loop_ghash
-.byte	102,15,56,0,199
+	pshufb	%xmm7,%xmm0
 	movdqu	%xmm0,(%edi)
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
diff --git a/gen/bcm/ghash-ssse3-x86-win.asm b/gen/bcm/ghash-ssse3-x86-win.asm
index 1eca092..387b4a1 100644
--- a/gen/bcm/ghash-ssse3-x86-win.asm
+++ b/gen/bcm/ghash-ssse3-x86-win.asm
@@ -29,7 +29,7 @@
 	pop	eax
 	movdqa	xmm7,[(L$reverse_bytes-L$000pic_point)+eax]
 	movdqa	xmm2,[(L$low4_mask-L$000pic_point)+eax]
-db	102,15,56,0,199
+	pshufb	xmm0,xmm7
 	movdqa	xmm1,xmm2
 	pandn	xmm1,xmm0
 	psrld	xmm1,4
@@ -41,12 +41,12 @@
 	movdqu	xmm4,[esi]
 	lea	esi,[16+esi]
 	movdqa	xmm6,xmm2
-db	102,15,58,15,243,1
+	palignr	xmm6,xmm3,1
 	movdqa	xmm3,xmm6
 	psrldq	xmm2,1
 	movdqa	xmm5,xmm4
-db	102,15,56,0,224
-db	102,15,56,0,233
+	pshufb	xmm4,xmm0
+	pshufb	xmm5,xmm1
 	pxor	xmm2,xmm5
 	movdqa	xmm5,xmm4
 	psllq	xmm5,60
@@ -72,12 +72,12 @@
 	movdqu	xmm4,[esi]
 	lea	esi,[16+esi]
 	movdqa	xmm6,xmm2
-db	102,15,58,15,243,1
+	palignr	xmm6,xmm3,1
 	movdqa	xmm3,xmm6
 	psrldq	xmm2,1
 	movdqa	xmm5,xmm4
-db	102,15,56,0,224
-db	102,15,56,0,233
+	pshufb	xmm4,xmm0
+	pshufb	xmm5,xmm1
 	pxor	xmm2,xmm5
 	movdqa	xmm5,xmm4
 	psllq	xmm5,60
@@ -103,12 +103,12 @@
 	movdqu	xmm4,[esi]
 	lea	esi,[16+esi]
 	movdqa	xmm6,xmm2
-db	102,15,58,15,243,1
+	palignr	xmm6,xmm3,1
 	movdqa	xmm3,xmm6
 	psrldq	xmm2,1
 	movdqa	xmm5,xmm4
-db	102,15,56,0,224
-db	102,15,56,0,233
+	pshufb	xmm4,xmm0
+	pshufb	xmm5,xmm1
 	pxor	xmm2,xmm5
 	movdqa	xmm5,xmm4
 	psllq	xmm5,60
@@ -129,7 +129,7 @@
 	psrlq	xmm3,5
 	pxor	xmm2,xmm3
 	pxor	xmm3,xmm3
-db	102,15,56,0,215
+	pshufb	xmm2,xmm7
 	movdqu	[edi],xmm2
 	pxor	xmm0,xmm0
 	pxor	xmm1,xmm1
@@ -161,12 +161,12 @@
 	pop	ebx
 	movdqa	xmm7,[(L$reverse_bytes-L$004pic_point)+ebx]
 	and	ecx,-16
-db	102,15,56,0,199
+	pshufb	xmm0,xmm7
 	pxor	xmm3,xmm3
 L$005loop_ghash:
 	movdqa	xmm2,[(L$low4_mask-L$004pic_point)+ebx]
 	movdqu	xmm1,[edx]
-db	102,15,56,0,207
+	pshufb	xmm1,xmm7
 	pxor	xmm0,xmm1
 	movdqa	xmm1,xmm2
 	pandn	xmm1,xmm0
@@ -178,12 +178,12 @@
 	movdqu	xmm4,[esi]
 	lea	esi,[16+esi]
 	movdqa	xmm6,xmm2
-db	102,15,58,15,243,1
+	palignr	xmm6,xmm3,1
 	movdqa	xmm3,xmm6
 	psrldq	xmm2,1
 	movdqa	xmm5,xmm4
-db	102,15,56,0,224
-db	102,15,56,0,233
+	pshufb	xmm4,xmm0
+	pshufb	xmm5,xmm1
 	pxor	xmm2,xmm5
 	movdqa	xmm5,xmm4
 	psllq	xmm5,60
@@ -209,12 +209,12 @@
 	movdqu	xmm4,[esi]
 	lea	esi,[16+esi]
 	movdqa	xmm6,xmm2
-db	102,15,58,15,243,1
+	palignr	xmm6,xmm3,1
 	movdqa	xmm3,xmm6
 	psrldq	xmm2,1
 	movdqa	xmm5,xmm4
-db	102,15,56,0,224
-db	102,15,56,0,233
+	pshufb	xmm4,xmm0
+	pshufb	xmm5,xmm1
 	pxor	xmm2,xmm5
 	movdqa	xmm5,xmm4
 	psllq	xmm5,60
@@ -240,12 +240,12 @@
 	movdqu	xmm4,[esi]
 	lea	esi,[16+esi]
 	movdqa	xmm6,xmm2
-db	102,15,58,15,243,1
+	palignr	xmm6,xmm3,1
 	movdqa	xmm3,xmm6
 	psrldq	xmm2,1
 	movdqa	xmm5,xmm4
-db	102,15,56,0,224
-db	102,15,56,0,233
+	pshufb	xmm4,xmm0
+	pshufb	xmm5,xmm1
 	pxor	xmm2,xmm5
 	movdqa	xmm5,xmm4
 	psllq	xmm5,60
@@ -271,7 +271,7 @@
 	lea	edx,[16+edx]
 	sub	ecx,16
 	jnz	NEAR L$005loop_ghash
-db	102,15,56,0,199
+	pshufb	xmm0,xmm7
 	movdqu	[edi],xmm0
 	pxor	xmm0,xmm0
 	pxor	xmm1,xmm1
diff --git a/gen/bcm/ghash-x86-apple.S b/gen/bcm/ghash-x86-apple.S
index a178b74..6221487 100644
--- a/gen/bcm/ghash-x86-apple.S
+++ b/gen/bcm/ghash-x86-apple.S
@@ -34,9 +34,9 @@
 	pshufd	$78,%xmm2,%xmm4
 	pxor	%xmm0,%xmm3
 	pxor	%xmm2,%xmm4
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
+	pclmulqdq	$0,%xmm2,%xmm0
+	pclmulqdq	$17,%xmm2,%xmm1
+	pclmulqdq	$0,%xmm4,%xmm3
 	xorps	%xmm0,%xmm3
 	xorps	%xmm1,%xmm3
 	movdqa	%xmm3,%xmm4
@@ -70,7 +70,7 @@
 	movdqu	%xmm2,(%edx)
 	pxor	%xmm0,%xmm4
 	movdqu	%xmm0,16(%edx)
-.byte	102,15,58,15,227,8
+	palignr	$8,%xmm3,%xmm4
 	movdqu	%xmm4,32(%edx)
 	ret
 .globl	_gcm_gmult_clmul
@@ -87,14 +87,14 @@
 	movdqu	(%eax),%xmm0
 	movdqa	(%ecx),%xmm5
 	movups	(%edx),%xmm2
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	movups	32(%edx),%xmm4
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
+	pclmulqdq	$0,%xmm2,%xmm0
+	pclmulqdq	$17,%xmm2,%xmm1
+	pclmulqdq	$0,%xmm4,%xmm3
 	xorps	%xmm0,%xmm3
 	xorps	%xmm1,%xmm3
 	movdqa	%xmm3,%xmm4
@@ -122,7 +122,7 @@
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	movdqu	%xmm0,(%eax)
 	ret
 .globl	_gcm_ghash_clmul
@@ -145,22 +145,22 @@
 	movdqu	(%eax),%xmm0
 	movdqa	(%ecx),%xmm5
 	movdqu	(%edx),%xmm2
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	subl	$16,%ebx
 	jz	L003odd_tail
 	movdqu	(%esi),%xmm3
 	movdqu	16(%esi),%xmm6
-.byte	102,15,56,0,221
-.byte	102,15,56,0,245
+	pshufb	%xmm5,%xmm3
+	pshufb	%xmm5,%xmm6
 	movdqu	32(%edx),%xmm5
 	pxor	%xmm3,%xmm0
 	pshufd	$78,%xmm6,%xmm3
 	movdqa	%xmm6,%xmm7
 	pxor	%xmm6,%xmm3
 	leal	32(%esi),%esi
-.byte	102,15,58,68,242,0
-.byte	102,15,58,68,250,17
-.byte	102,15,58,68,221,0
+	pclmulqdq	$0,%xmm2,%xmm6
+	pclmulqdq	$17,%xmm2,%xmm7
+	pclmulqdq	$0,%xmm5,%xmm3
 	movups	16(%edx),%xmm2
 	nop
 	subl	$32,%ebx
@@ -172,9 +172,9 @@
 	movdqa	%xmm0,%xmm1
 	pxor	%xmm0,%xmm4
 	nop
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,229,16
+	pclmulqdq	$0,%xmm2,%xmm0
+	pclmulqdq	$17,%xmm2,%xmm1
+	pclmulqdq	$16,%xmm5,%xmm4
 	movups	(%edx),%xmm2
 	xorps	%xmm6,%xmm0
 	movdqa	(%ecx),%xmm5
@@ -183,14 +183,14 @@
 	pxor	%xmm0,%xmm3
 	movdqu	16(%esi),%xmm6
 	pxor	%xmm1,%xmm3
-.byte	102,15,56,0,253
+	pshufb	%xmm5,%xmm7
 	pxor	%xmm3,%xmm4
 	movdqa	%xmm4,%xmm3
 	psrldq	$8,%xmm4
 	pslldq	$8,%xmm3
 	pxor	%xmm4,%xmm1
 	pxor	%xmm3,%xmm0
-.byte	102,15,56,0,245
+	pshufb	%xmm5,%xmm6
 	pxor	%xmm7,%xmm1
 	movdqa	%xmm6,%xmm7
 	movdqa	%xmm0,%xmm4
@@ -199,7 +199,7 @@
 	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-.byte	102,15,58,68,242,0
+	pclmulqdq	$0,%xmm2,%xmm6
 	movups	32(%edx),%xmm5
 	psllq	$57,%xmm0
 	movdqa	%xmm0,%xmm3
@@ -212,14 +212,14 @@
 	psrlq	$1,%xmm0
 	pxor	%xmm7,%xmm3
 	pxor	%xmm4,%xmm1
-.byte	102,15,58,68,250,17
+	pclmulqdq	$17,%xmm2,%xmm7
 	movups	16(%edx),%xmm2
 	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
-.byte	102,15,58,68,221,0
+	pclmulqdq	$0,%xmm5,%xmm3
 	leal	32(%esi),%esi
 	subl	$32,%ebx
 	ja	L005mod_loop
@@ -227,9 +227,9 @@
 	pshufd	$78,%xmm0,%xmm4
 	movdqa	%xmm0,%xmm1
 	pxor	%xmm0,%xmm4
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,229,16
+	pclmulqdq	$0,%xmm2,%xmm0
+	pclmulqdq	$17,%xmm2,%xmm1
+	pclmulqdq	$16,%xmm5,%xmm4
 	movdqa	(%ecx),%xmm5
 	xorps	%xmm6,%xmm0
 	xorps	%xmm7,%xmm1
@@ -266,16 +266,16 @@
 	movups	(%edx),%xmm2
 L003odd_tail:
 	movdqu	(%esi),%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 	pxor	%xmm3,%xmm0
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pshufd	$78,%xmm2,%xmm4
 	pxor	%xmm0,%xmm3
 	pxor	%xmm2,%xmm4
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
+	pclmulqdq	$0,%xmm2,%xmm0
+	pclmulqdq	$17,%xmm2,%xmm1
+	pclmulqdq	$0,%xmm4,%xmm3
 	xorps	%xmm0,%xmm3
 	xorps	%xmm1,%xmm3
 	movdqa	%xmm3,%xmm4
@@ -304,7 +304,7 @@
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
 L006done:
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	movdqu	%xmm0,(%eax)
 	popl	%edi
 	popl	%esi
diff --git a/gen/bcm/ghash-x86-linux.S b/gen/bcm/ghash-x86-linux.S
index c897efc..960eeff 100644
--- a/gen/bcm/ghash-x86-linux.S
+++ b/gen/bcm/ghash-x86-linux.S
@@ -35,9 +35,9 @@
 	pshufd	$78,%xmm2,%xmm4
 	pxor	%xmm0,%xmm3
 	pxor	%xmm2,%xmm4
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
+	pclmulqdq	$0,%xmm2,%xmm0
+	pclmulqdq	$17,%xmm2,%xmm1
+	pclmulqdq	$0,%xmm4,%xmm3
 	xorps	%xmm0,%xmm3
 	xorps	%xmm1,%xmm3
 	movdqa	%xmm3,%xmm4
@@ -71,7 +71,7 @@
 	movdqu	%xmm2,(%edx)
 	pxor	%xmm0,%xmm4
 	movdqu	%xmm0,16(%edx)
-.byte	102,15,58,15,227,8
+	palignr	$8,%xmm3,%xmm4
 	movdqu	%xmm4,32(%edx)
 	ret
 .size	gcm_init_clmul,.-.L_gcm_init_clmul_begin
@@ -90,14 +90,14 @@
 	movdqu	(%eax),%xmm0
 	movdqa	(%ecx),%xmm5
 	movups	(%edx),%xmm2
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	movups	32(%edx),%xmm4
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
+	pclmulqdq	$0,%xmm2,%xmm0
+	pclmulqdq	$17,%xmm2,%xmm1
+	pclmulqdq	$0,%xmm4,%xmm3
 	xorps	%xmm0,%xmm3
 	xorps	%xmm1,%xmm3
 	movdqa	%xmm3,%xmm4
@@ -125,7 +125,7 @@
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	movdqu	%xmm0,(%eax)
 	ret
 .size	gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin
@@ -150,22 +150,22 @@
 	movdqu	(%eax),%xmm0
 	movdqa	(%ecx),%xmm5
 	movdqu	(%edx),%xmm2
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	subl	$16,%ebx
 	jz	.L003odd_tail
 	movdqu	(%esi),%xmm3
 	movdqu	16(%esi),%xmm6
-.byte	102,15,56,0,221
-.byte	102,15,56,0,245
+	pshufb	%xmm5,%xmm3
+	pshufb	%xmm5,%xmm6
 	movdqu	32(%edx),%xmm5
 	pxor	%xmm3,%xmm0
 	pshufd	$78,%xmm6,%xmm3
 	movdqa	%xmm6,%xmm7
 	pxor	%xmm6,%xmm3
 	leal	32(%esi),%esi
-.byte	102,15,58,68,242,0
-.byte	102,15,58,68,250,17
-.byte	102,15,58,68,221,0
+	pclmulqdq	$0,%xmm2,%xmm6
+	pclmulqdq	$17,%xmm2,%xmm7
+	pclmulqdq	$0,%xmm5,%xmm3
 	movups	16(%edx),%xmm2
 	nop
 	subl	$32,%ebx
@@ -177,9 +177,9 @@
 	movdqa	%xmm0,%xmm1
 	pxor	%xmm0,%xmm4
 	nop
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,229,16
+	pclmulqdq	$0,%xmm2,%xmm0
+	pclmulqdq	$17,%xmm2,%xmm1
+	pclmulqdq	$16,%xmm5,%xmm4
 	movups	(%edx),%xmm2
 	xorps	%xmm6,%xmm0
 	movdqa	(%ecx),%xmm5
@@ -188,14 +188,14 @@
 	pxor	%xmm0,%xmm3
 	movdqu	16(%esi),%xmm6
 	pxor	%xmm1,%xmm3
-.byte	102,15,56,0,253
+	pshufb	%xmm5,%xmm7
 	pxor	%xmm3,%xmm4
 	movdqa	%xmm4,%xmm3
 	psrldq	$8,%xmm4
 	pslldq	$8,%xmm3
 	pxor	%xmm4,%xmm1
 	pxor	%xmm3,%xmm0
-.byte	102,15,56,0,245
+	pshufb	%xmm5,%xmm6
 	pxor	%xmm7,%xmm1
 	movdqa	%xmm6,%xmm7
 	movdqa	%xmm0,%xmm4
@@ -204,7 +204,7 @@
 	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-.byte	102,15,58,68,242,0
+	pclmulqdq	$0,%xmm2,%xmm6
 	movups	32(%edx),%xmm5
 	psllq	$57,%xmm0
 	movdqa	%xmm0,%xmm3
@@ -217,14 +217,14 @@
 	psrlq	$1,%xmm0
 	pxor	%xmm7,%xmm3
 	pxor	%xmm4,%xmm1
-.byte	102,15,58,68,250,17
+	pclmulqdq	$17,%xmm2,%xmm7
 	movups	16(%edx),%xmm2
 	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
-.byte	102,15,58,68,221,0
+	pclmulqdq	$0,%xmm5,%xmm3
 	leal	32(%esi),%esi
 	subl	$32,%ebx
 	ja	.L005mod_loop
@@ -232,9 +232,9 @@
 	pshufd	$78,%xmm0,%xmm4
 	movdqa	%xmm0,%xmm1
 	pxor	%xmm0,%xmm4
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,229,16
+	pclmulqdq	$0,%xmm2,%xmm0
+	pclmulqdq	$17,%xmm2,%xmm1
+	pclmulqdq	$16,%xmm5,%xmm4
 	movdqa	(%ecx),%xmm5
 	xorps	%xmm6,%xmm0
 	xorps	%xmm7,%xmm1
@@ -271,16 +271,16 @@
 	movups	(%edx),%xmm2
 .L003odd_tail:
 	movdqu	(%esi),%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 	pxor	%xmm3,%xmm0
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pshufd	$78,%xmm2,%xmm4
 	pxor	%xmm0,%xmm3
 	pxor	%xmm2,%xmm4
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
+	pclmulqdq	$0,%xmm2,%xmm0
+	pclmulqdq	$17,%xmm2,%xmm1
+	pclmulqdq	$0,%xmm4,%xmm3
 	xorps	%xmm0,%xmm3
 	xorps	%xmm1,%xmm3
 	movdqa	%xmm3,%xmm4
@@ -309,7 +309,7 @@
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
 .L006done:
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	movdqu	%xmm0,(%eax)
 	popl	%edi
 	popl	%esi
diff --git a/gen/bcm/ghash-x86-win.asm b/gen/bcm/ghash-x86-win.asm
index d982fd6..64e8332 100644
--- a/gen/bcm/ghash-x86-win.asm
+++ b/gen/bcm/ghash-x86-win.asm
@@ -41,9 +41,9 @@
 	pshufd	xmm4,xmm2,78
 	pxor	xmm3,xmm0
 	pxor	xmm4,xmm2
-db	102,15,58,68,194,0
-db	102,15,58,68,202,17
-db	102,15,58,68,220,0
+	pclmulqdq	xmm0,xmm2,0
+	pclmulqdq	xmm1,xmm2,17
+	pclmulqdq	xmm3,xmm4,0
 	xorps	xmm3,xmm0
 	xorps	xmm3,xmm1
 	movdqa	xmm4,xmm3
@@ -77,7 +77,7 @@
 	movdqu	[edx],xmm2
 	pxor	xmm4,xmm0
 	movdqu	[16+edx],xmm0
-db	102,15,58,15,227,8
+	palignr	xmm4,xmm3,8
 	movdqu	[32+edx],xmm4
 	ret
 global	_gcm_gmult_clmul
@@ -93,14 +93,14 @@
 	movdqu	xmm0,[eax]
 	movdqa	xmm5,[ecx]
 	movups	xmm2,[edx]
-db	102,15,56,0,197
+	pshufb	xmm0,xmm5
 	movups	xmm4,[32+edx]
 	movdqa	xmm1,xmm0
 	pshufd	xmm3,xmm0,78
 	pxor	xmm3,xmm0
-db	102,15,58,68,194,0
-db	102,15,58,68,202,17
-db	102,15,58,68,220,0
+	pclmulqdq	xmm0,xmm2,0
+	pclmulqdq	xmm1,xmm2,17
+	pclmulqdq	xmm3,xmm4,0
 	xorps	xmm3,xmm0
 	xorps	xmm3,xmm1
 	movdqa	xmm4,xmm3
@@ -128,7 +128,7 @@
 	pxor	xmm0,xmm4
 	psrlq	xmm0,1
 	pxor	xmm0,xmm1
-db	102,15,56,0,197
+	pshufb	xmm0,xmm5
 	movdqu	[eax],xmm0
 	ret
 global	_gcm_ghash_clmul
@@ -150,22 +150,22 @@
 	movdqu	xmm0,[eax]
 	movdqa	xmm5,[ecx]
 	movdqu	xmm2,[edx]
-db	102,15,56,0,197
+	pshufb	xmm0,xmm5
 	sub	ebx,16
 	jz	NEAR L$003odd_tail
 	movdqu	xmm3,[esi]
 	movdqu	xmm6,[16+esi]
-db	102,15,56,0,221
-db	102,15,56,0,245
+	pshufb	xmm3,xmm5
+	pshufb	xmm6,xmm5
 	movdqu	xmm5,[32+edx]
 	pxor	xmm0,xmm3
 	pshufd	xmm3,xmm6,78
 	movdqa	xmm7,xmm6
 	pxor	xmm3,xmm6
 	lea	esi,[32+esi]
-db	102,15,58,68,242,0
-db	102,15,58,68,250,17
-db	102,15,58,68,221,0
+	pclmulqdq	xmm6,xmm2,0
+	pclmulqdq	xmm7,xmm2,17
+	pclmulqdq	xmm3,xmm5,0
 	movups	xmm2,[16+edx]
 	nop
 	sub	ebx,32
@@ -177,9 +177,9 @@
 	movdqa	xmm1,xmm0
 	pxor	xmm4,xmm0
 	nop
-db	102,15,58,68,194,0
-db	102,15,58,68,202,17
-db	102,15,58,68,229,16
+	pclmulqdq	xmm0,xmm2,0
+	pclmulqdq	xmm1,xmm2,17
+	pclmulqdq	xmm4,xmm5,16
 	movups	xmm2,[edx]
 	xorps	xmm0,xmm6
 	movdqa	xmm5,[ecx]
@@ -188,14 +188,14 @@
 	pxor	xmm3,xmm0
 	movdqu	xmm6,[16+esi]
 	pxor	xmm3,xmm1
-db	102,15,56,0,253
+	pshufb	xmm7,xmm5
 	pxor	xmm4,xmm3
 	movdqa	xmm3,xmm4
 	psrldq	xmm4,8
 	pslldq	xmm3,8
 	pxor	xmm1,xmm4
 	pxor	xmm0,xmm3
-db	102,15,56,0,245
+	pshufb	xmm6,xmm5
 	pxor	xmm1,xmm7
 	movdqa	xmm7,xmm6
 	movdqa	xmm4,xmm0
@@ -204,7 +204,7 @@
 	pxor	xmm3,xmm0
 	psllq	xmm0,1
 	pxor	xmm0,xmm3
-db	102,15,58,68,242,0
+	pclmulqdq	xmm6,xmm2,0
 	movups	xmm5,[32+edx]
 	psllq	xmm0,57
 	movdqa	xmm3,xmm0
@@ -217,14 +217,14 @@
 	psrlq	xmm0,1
 	pxor	xmm3,xmm7
 	pxor	xmm1,xmm4
-db	102,15,58,68,250,17
+	pclmulqdq	xmm7,xmm2,17
 	movups	xmm2,[16+edx]
 	pxor	xmm4,xmm0
 	psrlq	xmm0,5
 	pxor	xmm0,xmm4
 	psrlq	xmm0,1
 	pxor	xmm0,xmm1
-db	102,15,58,68,221,0
+	pclmulqdq	xmm3,xmm5,0
 	lea	esi,[32+esi]
 	sub	ebx,32
 	ja	NEAR L$005mod_loop
@@ -232,9 +232,9 @@
 	pshufd	xmm4,xmm0,78
 	movdqa	xmm1,xmm0
 	pxor	xmm4,xmm0
-db	102,15,58,68,194,0
-db	102,15,58,68,202,17
-db	102,15,58,68,229,16
+	pclmulqdq	xmm0,xmm2,0
+	pclmulqdq	xmm1,xmm2,17
+	pclmulqdq	xmm4,xmm5,16
 	movdqa	xmm5,[ecx]
 	xorps	xmm0,xmm6
 	xorps	xmm1,xmm7
@@ -271,16 +271,16 @@
 	movups	xmm2,[edx]
 L$003odd_tail:
 	movdqu	xmm3,[esi]
-db	102,15,56,0,221
+	pshufb	xmm3,xmm5
 	pxor	xmm0,xmm3
 	movdqa	xmm1,xmm0
 	pshufd	xmm3,xmm0,78
 	pshufd	xmm4,xmm2,78
 	pxor	xmm3,xmm0
 	pxor	xmm4,xmm2
-db	102,15,58,68,194,0
-db	102,15,58,68,202,17
-db	102,15,58,68,220,0
+	pclmulqdq	xmm0,xmm2,0
+	pclmulqdq	xmm1,xmm2,17
+	pclmulqdq	xmm3,xmm4,0
 	xorps	xmm3,xmm0
 	xorps	xmm3,xmm1
 	movdqa	xmm4,xmm3
@@ -309,7 +309,7 @@
 	psrlq	xmm0,1
 	pxor	xmm0,xmm1
 L$006done:
-db	102,15,56,0,197
+	pshufb	xmm0,xmm5
 	movdqu	[eax],xmm0
 	pop	edi
 	pop	esi
diff --git a/gen/bcm/sha1-586-apple.S b/gen/bcm/sha1-586-apple.S
index f0ab02b..f2c45ec 100644
--- a/gen/bcm/sha1-586-apple.S
+++ b/gen/bcm/sha1-586-apple.S
@@ -1424,11 +1424,11 @@
 	movdqu	-48(%ebp),%xmm1
 	movdqu	-32(%ebp),%xmm2
 	movdqu	-16(%ebp),%xmm3
-.byte	102,15,56,0,198
-.byte	102,15,56,0,206
-.byte	102,15,56,0,214
+	pshufb	%xmm6,%xmm0
+	pshufb	%xmm6,%xmm1
+	pshufb	%xmm6,%xmm2
 	movdqa	%xmm7,96(%esp)
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm7,%xmm0
 	paddd	%xmm7,%xmm1
 	paddd	%xmm7,%xmm2
@@ -2355,7 +2355,7 @@
 	movdqu	32(%ebp),%xmm2
 	movdqu	48(%ebp),%xmm3
 	addl	$64,%ebp
-.byte	102,15,56,0,198
+	pshufb	%xmm6,%xmm0
 	movl	%ebp,196(%esp)
 	movdqa	%xmm7,96(%esp)
 	addl	16(%esp),%ebx
@@ -2365,7 +2365,7 @@
 	addl	%esi,%ebx
 	xorl	%edi,%ebp
 	rorl	$7,%edx
-.byte	102,15,56,0,206
+	pshufb	%xmm6,%xmm1
 	addl	%ecx,%ebx
 	addl	20(%esp),%eax
 	xorl	%edx,%ebp
@@ -2401,7 +2401,7 @@
 	addl	%esi,%ecx
 	xorl	%eax,%ebp
 	rorl	$7,%edi
-.byte	102,15,56,0,214
+	pshufb	%xmm6,%xmm2
 	addl	%edx,%ecx
 	addl	36(%esp),%ebx
 	xorl	%edi,%ebp
@@ -2437,7 +2437,7 @@
 	addl	%esi,%edx
 	xorl	%ebx,%ebp
 	rorl	$7,%eax
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	addl	%edi,%edx
 	addl	52(%esp),%ecx
 	xorl	%eax,%ebp
diff --git a/gen/bcm/sha1-586-linux.S b/gen/bcm/sha1-586-linux.S
index 0e5754f..3d8d213 100644
--- a/gen/bcm/sha1-586-linux.S
+++ b/gen/bcm/sha1-586-linux.S
@@ -1427,11 +1427,11 @@
 	movdqu	-48(%ebp),%xmm1
 	movdqu	-32(%ebp),%xmm2
 	movdqu	-16(%ebp),%xmm3
-.byte	102,15,56,0,198
-.byte	102,15,56,0,206
-.byte	102,15,56,0,214
+	pshufb	%xmm6,%xmm0
+	pshufb	%xmm6,%xmm1
+	pshufb	%xmm6,%xmm2
 	movdqa	%xmm7,96(%esp)
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm7,%xmm0
 	paddd	%xmm7,%xmm1
 	paddd	%xmm7,%xmm2
@@ -2358,7 +2358,7 @@
 	movdqu	32(%ebp),%xmm2
 	movdqu	48(%ebp),%xmm3
 	addl	$64,%ebp
-.byte	102,15,56,0,198
+	pshufb	%xmm6,%xmm0
 	movl	%ebp,196(%esp)
 	movdqa	%xmm7,96(%esp)
 	addl	16(%esp),%ebx
@@ -2368,7 +2368,7 @@
 	addl	%esi,%ebx
 	xorl	%edi,%ebp
 	rorl	$7,%edx
-.byte	102,15,56,0,206
+	pshufb	%xmm6,%xmm1
 	addl	%ecx,%ebx
 	addl	20(%esp),%eax
 	xorl	%edx,%ebp
@@ -2404,7 +2404,7 @@
 	addl	%esi,%ecx
 	xorl	%eax,%ebp
 	rorl	$7,%edi
-.byte	102,15,56,0,214
+	pshufb	%xmm6,%xmm2
 	addl	%edx,%ecx
 	addl	36(%esp),%ebx
 	xorl	%edi,%ebp
@@ -2440,7 +2440,7 @@
 	addl	%esi,%edx
 	xorl	%ebx,%ebp
 	rorl	$7,%eax
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	addl	%edi,%edx
 	addl	52(%esp),%ecx
 	xorl	%eax,%ebp
diff --git a/gen/bcm/sha1-586-win.asm b/gen/bcm/sha1-586-win.asm
index a4e4875..7f0b44b 100644
--- a/gen/bcm/sha1-586-win.asm
+++ b/gen/bcm/sha1-586-win.asm
@@ -1430,11 +1430,11 @@
 	movdqu	xmm1,[ebp-48]
 	movdqu	xmm2,[ebp-32]
 	movdqu	xmm3,[ebp-16]
-db	102,15,56,0,198
-db	102,15,56,0,206
-db	102,15,56,0,214
+	pshufb	xmm0,xmm6
+	pshufb	xmm1,xmm6
+	pshufb	xmm2,xmm6
 	movdqa	[96+esp],xmm7
-db	102,15,56,0,222
+	pshufb	xmm3,xmm6
 	paddd	xmm0,xmm7
 	paddd	xmm1,xmm7
 	paddd	xmm2,xmm7
@@ -2361,7 +2361,7 @@
 	movdqu	xmm2,[32+ebp]
 	movdqu	xmm3,[48+ebp]
 	add	ebp,64
-db	102,15,56,0,198
+	pshufb	xmm0,xmm6
 	mov	DWORD [196+esp],ebp
 	movdqa	[96+esp],xmm7
 	add	ebx,DWORD [16+esp]
@@ -2371,7 +2371,7 @@
 	add	ebx,esi
 	xor	ebp,edi
 	ror	edx,7
-db	102,15,56,0,206
+	pshufb	xmm1,xmm6
 	add	ebx,ecx
 	add	eax,DWORD [20+esp]
 	xor	ebp,edx
@@ -2407,7 +2407,7 @@
 	add	ecx,esi
 	xor	ebp,eax
 	ror	edi,7
-db	102,15,56,0,214
+	pshufb	xmm2,xmm6
 	add	ecx,edx
 	add	ebx,DWORD [36+esp]
 	xor	ebp,edi
@@ -2443,7 +2443,7 @@
 	add	edx,esi
 	xor	ebp,ebx
 	ror	eax,7
-db	102,15,56,0,222
+	pshufb	xmm3,xmm6
 	add	edx,edi
 	add	ecx,DWORD [52+esp]
 	xor	ebp,eax
diff --git a/gen/bcm/sha256-586-apple.S b/gen/bcm/sha256-586-apple.S
index 8e74e68..66107d6 100644
--- a/gen/bcm/sha256-586-apple.S
+++ b/gen/bcm/sha256-586-apple.S
@@ -3203,14 +3203,14 @@
 	movdqu	32(%edi),%xmm2
 	movdqu	48(%edi),%xmm3
 	addl	$64,%edi
-.byte	102,15,56,0,199
+	pshufb	%xmm7,%xmm0
 	movl	%edi,100(%esp)
-.byte	102,15,56,0,207
+	pshufb	%xmm7,%xmm1
 	movdqa	(%ebp),%xmm4
-.byte	102,15,56,0,215
+	pshufb	%xmm7,%xmm2
 	movdqa	16(%ebp),%xmm5
 	paddd	%xmm0,%xmm4
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	movdqa	32(%ebp),%xmm6
 	paddd	%xmm1,%xmm5
 	movdqa	48(%ebp),%xmm7
@@ -3231,11 +3231,11 @@
 	movdqa	%xmm3,%xmm7
 	xorl	%ecx,%edx
 	movl	24(%esp),%edi
-.byte	102,15,58,15,224,4
+	palignr	$4,%xmm0,%xmm4
 	xorl	%edi,%esi
 	rorl	$5,%edx
 	andl	%ecx,%esi
-.byte	102,15,58,15,250,4
+	palignr	$4,%xmm2,%xmm7
 	movl	%ecx,16(%esp)
 	xorl	%ecx,%edx
 	xorl	%esi,%edi
@@ -3391,11 +3391,11 @@
 	movdqa	%xmm0,%xmm7
 	xorl	%ecx,%edx
 	movl	8(%esp),%edi
-.byte	102,15,58,15,225,4
+	palignr	$4,%xmm1,%xmm4
 	xorl	%edi,%esi
 	rorl	$5,%edx
 	andl	%ecx,%esi
-.byte	102,15,58,15,251,4
+	palignr	$4,%xmm3,%xmm7
 	movl	%ecx,(%esp)
 	xorl	%ecx,%edx
 	xorl	%esi,%edi
@@ -3551,11 +3551,11 @@
 	movdqa	%xmm1,%xmm7
 	xorl	%ecx,%edx
 	movl	24(%esp),%edi
-.byte	102,15,58,15,226,4
+	palignr	$4,%xmm2,%xmm4
 	xorl	%edi,%esi
 	rorl	$5,%edx
 	andl	%ecx,%esi
-.byte	102,15,58,15,248,4
+	palignr	$4,%xmm0,%xmm7
 	movl	%ecx,16(%esp)
 	xorl	%ecx,%edx
 	xorl	%esi,%edi
@@ -3711,11 +3711,11 @@
 	movdqa	%xmm2,%xmm7
 	xorl	%ecx,%edx
 	movl	8(%esp),%edi
-.byte	102,15,58,15,227,4
+	palignr	$4,%xmm3,%xmm4
 	xorl	%edi,%esi
 	rorl	$5,%edx
 	andl	%ecx,%esi
-.byte	102,15,58,15,249,4
+	palignr	$4,%xmm1,%xmm7
 	movl	%ecx,(%esp)
 	xorl	%ecx,%edx
 	xorl	%esi,%edi
diff --git a/gen/bcm/sha256-586-linux.S b/gen/bcm/sha256-586-linux.S
index 41b3759..d409651 100644
--- a/gen/bcm/sha256-586-linux.S
+++ b/gen/bcm/sha256-586-linux.S
@@ -3206,14 +3206,14 @@
 	movdqu	32(%edi),%xmm2
 	movdqu	48(%edi),%xmm3
 	addl	$64,%edi
-.byte	102,15,56,0,199
+	pshufb	%xmm7,%xmm0
 	movl	%edi,100(%esp)
-.byte	102,15,56,0,207
+	pshufb	%xmm7,%xmm1
 	movdqa	(%ebp),%xmm4
-.byte	102,15,56,0,215
+	pshufb	%xmm7,%xmm2
 	movdqa	16(%ebp),%xmm5
 	paddd	%xmm0,%xmm4
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	movdqa	32(%ebp),%xmm6
 	paddd	%xmm1,%xmm5
 	movdqa	48(%ebp),%xmm7
@@ -3234,11 +3234,11 @@
 	movdqa	%xmm3,%xmm7
 	xorl	%ecx,%edx
 	movl	24(%esp),%edi
-.byte	102,15,58,15,224,4
+	palignr	$4,%xmm0,%xmm4
 	xorl	%edi,%esi
 	rorl	$5,%edx
 	andl	%ecx,%esi
-.byte	102,15,58,15,250,4
+	palignr	$4,%xmm2,%xmm7
 	movl	%ecx,16(%esp)
 	xorl	%ecx,%edx
 	xorl	%esi,%edi
@@ -3394,11 +3394,11 @@
 	movdqa	%xmm0,%xmm7
 	xorl	%ecx,%edx
 	movl	8(%esp),%edi
-.byte	102,15,58,15,225,4
+	palignr	$4,%xmm1,%xmm4
 	xorl	%edi,%esi
 	rorl	$5,%edx
 	andl	%ecx,%esi
-.byte	102,15,58,15,251,4
+	palignr	$4,%xmm3,%xmm7
 	movl	%ecx,(%esp)
 	xorl	%ecx,%edx
 	xorl	%esi,%edi
@@ -3554,11 +3554,11 @@
 	movdqa	%xmm1,%xmm7
 	xorl	%ecx,%edx
 	movl	24(%esp),%edi
-.byte	102,15,58,15,226,4
+	palignr	$4,%xmm2,%xmm4
 	xorl	%edi,%esi
 	rorl	$5,%edx
 	andl	%ecx,%esi
-.byte	102,15,58,15,248,4
+	palignr	$4,%xmm0,%xmm7
 	movl	%ecx,16(%esp)
 	xorl	%ecx,%edx
 	xorl	%esi,%edi
@@ -3714,11 +3714,11 @@
 	movdqa	%xmm2,%xmm7
 	xorl	%ecx,%edx
 	movl	8(%esp),%edi
-.byte	102,15,58,15,227,4
+	palignr	$4,%xmm3,%xmm4
 	xorl	%edi,%esi
 	rorl	$5,%edx
 	andl	%ecx,%esi
-.byte	102,15,58,15,249,4
+	palignr	$4,%xmm1,%xmm7
 	movl	%ecx,(%esp)
 	xorl	%ecx,%edx
 	xorl	%esi,%edi
diff --git a/gen/bcm/sha256-586-win.asm b/gen/bcm/sha256-586-win.asm
index 8878695..b6fed17 100644
--- a/gen/bcm/sha256-586-win.asm
+++ b/gen/bcm/sha256-586-win.asm
@@ -3209,14 +3209,14 @@
 	movdqu	xmm2,[32+edi]
 	movdqu	xmm3,[48+edi]
 	add	edi,64
-db	102,15,56,0,199
+	pshufb	xmm0,xmm7
 	mov	DWORD [100+esp],edi
-db	102,15,56,0,207
+	pshufb	xmm1,xmm7
 	movdqa	xmm4,[ebp]
-db	102,15,56,0,215
+	pshufb	xmm2,xmm7
 	movdqa	xmm5,[16+ebp]
 	paddd	xmm4,xmm0
-db	102,15,56,0,223
+	pshufb	xmm3,xmm7
 	movdqa	xmm6,[32+ebp]
 	paddd	xmm5,xmm1
 	movdqa	xmm7,[48+ebp]
@@ -3237,11 +3237,11 @@
 	movdqa	xmm7,xmm3
 	xor	edx,ecx
 	mov	edi,DWORD [24+esp]
-db	102,15,58,15,224,4
+	palignr	xmm4,xmm0,4
 	xor	esi,edi
 	ror	edx,5
 	and	esi,ecx
-db	102,15,58,15,250,4
+	palignr	xmm7,xmm2,4
 	mov	DWORD [16+esp],ecx
 	xor	edx,ecx
 	xor	edi,esi
@@ -3397,11 +3397,11 @@
 	movdqa	xmm7,xmm0
 	xor	edx,ecx
 	mov	edi,DWORD [8+esp]
-db	102,15,58,15,225,4
+	palignr	xmm4,xmm1,4
 	xor	esi,edi
 	ror	edx,5
 	and	esi,ecx
-db	102,15,58,15,251,4
+	palignr	xmm7,xmm3,4
 	mov	DWORD [esp],ecx
 	xor	edx,ecx
 	xor	edi,esi
@@ -3557,11 +3557,11 @@
 	movdqa	xmm7,xmm1
 	xor	edx,ecx
 	mov	edi,DWORD [24+esp]
-db	102,15,58,15,226,4
+	palignr	xmm4,xmm2,4
 	xor	esi,edi
 	ror	edx,5
 	and	esi,ecx
-db	102,15,58,15,248,4
+	palignr	xmm7,xmm0,4
 	mov	DWORD [16+esp],ecx
 	xor	edx,ecx
 	xor	edi,esi
@@ -3717,11 +3717,11 @@
 	movdqa	xmm7,xmm2
 	xor	edx,ecx
 	mov	edi,DWORD [8+esp]
-db	102,15,58,15,227,4
+	palignr	xmm4,xmm3,4
 	xor	esi,edi
 	ror	edx,5
 	and	esi,ecx
-db	102,15,58,15,249,4
+	palignr	xmm7,xmm1,4
 	mov	DWORD [esp],ecx
 	xor	edx,ecx
 	xor	edi,esi
diff --git a/gen/bcm/sha512-586-apple.S b/gen/bcm/sha512-586-apple.S
index 785eaf5..be41827 100644
--- a/gen/bcm/sha512-586-apple.S
+++ b/gen/bcm/sha512-586-apple.S
@@ -404,50 +404,50 @@
 	subl	$256,%esp
 	movdqa	640(%ebp),%xmm1
 	movdqu	(%edi),%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	movdqa	(%ebp),%xmm3
 	movdqa	%xmm1,%xmm2
 	movdqu	16(%edi),%xmm1
 	paddq	%xmm0,%xmm3
-.byte	102,15,56,0,202
+	pshufb	%xmm2,%xmm1
 	movdqa	%xmm3,-128(%edx)
 	movdqa	16(%ebp),%xmm4
 	movdqa	%xmm2,%xmm3
 	movdqu	32(%edi),%xmm2
 	paddq	%xmm1,%xmm4
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm4,-112(%edx)
 	movdqa	32(%ebp),%xmm5
 	movdqa	%xmm3,%xmm4
 	movdqu	48(%edi),%xmm3
 	paddq	%xmm2,%xmm5
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	movdqa	%xmm5,-96(%edx)
 	movdqa	48(%ebp),%xmm6
 	movdqa	%xmm4,%xmm5
 	movdqu	64(%edi),%xmm4
 	paddq	%xmm3,%xmm6
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	movdqa	%xmm6,-80(%edx)
 	movdqa	64(%ebp),%xmm7
 	movdqa	%xmm5,%xmm6
 	movdqu	80(%edi),%xmm5
 	paddq	%xmm4,%xmm7
-.byte	102,15,56,0,238
+	pshufb	%xmm6,%xmm5
 	movdqa	%xmm7,-64(%edx)
 	movdqa	%xmm0,(%edx)
 	movdqa	80(%ebp),%xmm0
 	movdqa	%xmm6,%xmm7
 	movdqu	96(%edi),%xmm6
 	paddq	%xmm5,%xmm0
-.byte	102,15,56,0,247
+	pshufb	%xmm7,%xmm6
 	movdqa	%xmm0,-48(%edx)
 	movdqa	%xmm1,16(%edx)
 	movdqa	96(%ebp),%xmm1
 	movdqa	%xmm7,%xmm0
 	movdqu	112(%edi),%xmm7
 	paddq	%xmm6,%xmm1
-.byte	102,15,56,0,248
+	pshufb	%xmm0,%xmm7
 	movdqa	%xmm1,-32(%edx)
 	movdqa	%xmm2,32(%edx)
 	movdqa	112(%ebp),%xmm2
@@ -478,9 +478,9 @@
 L00600_47_ssse3:
 	movdqa	%xmm5,%xmm3
 	movdqa	%xmm2,%xmm1
-.byte	102,15,58,15,208,8
+	palignr	$8,%xmm0,%xmm2
 	movdqa	%xmm4,(%edx)
-.byte	102,15,58,15,220,8
+	palignr	$8,%xmm4,%xmm3
 	movdqa	%xmm2,%xmm4
 	psrlq	$7,%xmm2
 	paddq	%xmm3,%xmm0
@@ -605,9 +605,9 @@
 	movdqa	%xmm2,-128(%edx)
 	movdqa	%xmm6,%xmm4
 	movdqa	%xmm3,%xmm2
-.byte	102,15,58,15,217,8
+	palignr	$8,%xmm1,%xmm3
 	movdqa	%xmm5,16(%edx)
-.byte	102,15,58,15,229,8
+	palignr	$8,%xmm5,%xmm4
 	movdqa	%xmm3,%xmm5
 	psrlq	$7,%xmm3
 	paddq	%xmm4,%xmm1
@@ -732,9 +732,9 @@
 	movdqa	%xmm3,-112(%edx)
 	movdqa	%xmm7,%xmm5
 	movdqa	%xmm4,%xmm3
-.byte	102,15,58,15,226,8
+	palignr	$8,%xmm2,%xmm4
 	movdqa	%xmm6,32(%edx)
-.byte	102,15,58,15,238,8
+	palignr	$8,%xmm6,%xmm5
 	movdqa	%xmm4,%xmm6
 	psrlq	$7,%xmm4
 	paddq	%xmm5,%xmm2
@@ -859,9 +859,9 @@
 	movdqa	%xmm4,-96(%edx)
 	movdqa	%xmm0,%xmm6
 	movdqa	%xmm5,%xmm4
-.byte	102,15,58,15,235,8
+	palignr	$8,%xmm3,%xmm5
 	movdqa	%xmm7,48(%edx)
-.byte	102,15,58,15,247,8
+	palignr	$8,%xmm7,%xmm6
 	movdqa	%xmm5,%xmm7
 	psrlq	$7,%xmm5
 	paddq	%xmm6,%xmm3
@@ -986,9 +986,9 @@
 	movdqa	%xmm5,-80(%edx)
 	movdqa	%xmm1,%xmm7
 	movdqa	%xmm6,%xmm5
-.byte	102,15,58,15,244,8
+	palignr	$8,%xmm4,%xmm6
 	movdqa	%xmm0,(%edx)
-.byte	102,15,58,15,248,8
+	palignr	$8,%xmm0,%xmm7
 	movdqa	%xmm6,%xmm0
 	psrlq	$7,%xmm6
 	paddq	%xmm7,%xmm4
@@ -1113,9 +1113,9 @@
 	movdqa	%xmm6,-64(%edx)
 	movdqa	%xmm2,%xmm0
 	movdqa	%xmm7,%xmm6
-.byte	102,15,58,15,253,8
+	palignr	$8,%xmm5,%xmm7
 	movdqa	%xmm1,16(%edx)
-.byte	102,15,58,15,193,8
+	palignr	$8,%xmm1,%xmm0
 	movdqa	%xmm7,%xmm1
 	psrlq	$7,%xmm7
 	paddq	%xmm0,%xmm5
@@ -1240,9 +1240,9 @@
 	movdqa	%xmm7,-48(%edx)
 	movdqa	%xmm3,%xmm1
 	movdqa	%xmm0,%xmm7
-.byte	102,15,58,15,198,8
+	palignr	$8,%xmm6,%xmm0
 	movdqa	%xmm2,32(%edx)
-.byte	102,15,58,15,202,8
+	palignr	$8,%xmm2,%xmm1
 	movdqa	%xmm0,%xmm2
 	psrlq	$7,%xmm0
 	paddq	%xmm1,%xmm6
@@ -1367,9 +1367,9 @@
 	movdqa	%xmm0,-32(%edx)
 	movdqa	%xmm4,%xmm2
 	movdqa	%xmm1,%xmm0
-.byte	102,15,58,15,207,8
+	palignr	$8,%xmm7,%xmm1
 	movdqa	%xmm3,48(%edx)
-.byte	102,15,58,15,211,8
+	palignr	$8,%xmm3,%xmm2
 	movdqa	%xmm1,%xmm3
 	psrlq	$7,%xmm1
 	paddq	%xmm2,%xmm7
@@ -1498,12 +1498,12 @@
 	movdqa	(%ebp),%xmm1
 	leal	-640(%ebp),%ebp
 	movdqu	(%ebx),%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	movdqa	(%ebp),%xmm3
 	movdqa	%xmm1,%xmm2
 	movdqu	16(%ebx),%xmm1
 	paddq	%xmm0,%xmm3
-.byte	102,15,56,0,202
+	pshufb	%xmm2,%xmm1
 	movq	%mm4,%mm1
 	movq	-128(%edx),%mm7
 	pxor	%mm6,%mm5
@@ -1601,7 +1601,7 @@
 	movdqa	%xmm2,%xmm3
 	movdqu	32(%ebx),%xmm2
 	paddq	%xmm1,%xmm4
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movq	%mm4,%mm1
 	movq	-112(%edx),%mm7
 	pxor	%mm6,%mm5
@@ -1699,7 +1699,7 @@
 	movdqa	%xmm3,%xmm4
 	movdqu	48(%ebx),%xmm3
 	paddq	%xmm2,%xmm5
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	movq	%mm4,%mm1
 	movq	-96(%edx),%mm7
 	pxor	%mm6,%mm5
@@ -1797,7 +1797,7 @@
 	movdqa	%xmm4,%xmm5
 	movdqu	64(%ebx),%xmm4
 	paddq	%xmm3,%xmm6
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	movq	%mm4,%mm1
 	movq	-80(%edx),%mm7
 	pxor	%mm6,%mm5
@@ -1895,7 +1895,7 @@
 	movdqa	%xmm5,%xmm6
 	movdqu	80(%ebx),%xmm5
 	paddq	%xmm4,%xmm7
-.byte	102,15,56,0,238
+	pshufb	%xmm6,%xmm5
 	movq	%mm4,%mm1
 	movq	-64(%edx),%mm7
 	pxor	%mm6,%mm5
@@ -1994,7 +1994,7 @@
 	movdqa	%xmm6,%xmm7
 	movdqu	96(%ebx),%xmm6
 	paddq	%xmm5,%xmm0
-.byte	102,15,56,0,247
+	pshufb	%xmm7,%xmm6
 	movq	%mm4,%mm1
 	movq	-48(%edx),%mm7
 	pxor	%mm6,%mm5
@@ -2093,7 +2093,7 @@
 	movdqa	%xmm7,%xmm0
 	movdqu	112(%ebx),%xmm7
 	paddq	%xmm6,%xmm1
-.byte	102,15,56,0,248
+	pshufb	%xmm0,%xmm7
 	movq	%mm4,%mm1
 	movq	-32(%edx),%mm7
 	pxor	%mm6,%mm5
diff --git a/gen/bcm/sha512-586-linux.S b/gen/bcm/sha512-586-linux.S
index e82bd00..ebeb87d 100644
--- a/gen/bcm/sha512-586-linux.S
+++ b/gen/bcm/sha512-586-linux.S
@@ -407,50 +407,50 @@
 	subl	$256,%esp
 	movdqa	640(%ebp),%xmm1
 	movdqu	(%edi),%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	movdqa	(%ebp),%xmm3
 	movdqa	%xmm1,%xmm2
 	movdqu	16(%edi),%xmm1
 	paddq	%xmm0,%xmm3
-.byte	102,15,56,0,202
+	pshufb	%xmm2,%xmm1
 	movdqa	%xmm3,-128(%edx)
 	movdqa	16(%ebp),%xmm4
 	movdqa	%xmm2,%xmm3
 	movdqu	32(%edi),%xmm2
 	paddq	%xmm1,%xmm4
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm4,-112(%edx)
 	movdqa	32(%ebp),%xmm5
 	movdqa	%xmm3,%xmm4
 	movdqu	48(%edi),%xmm3
 	paddq	%xmm2,%xmm5
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	movdqa	%xmm5,-96(%edx)
 	movdqa	48(%ebp),%xmm6
 	movdqa	%xmm4,%xmm5
 	movdqu	64(%edi),%xmm4
 	paddq	%xmm3,%xmm6
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	movdqa	%xmm6,-80(%edx)
 	movdqa	64(%ebp),%xmm7
 	movdqa	%xmm5,%xmm6
 	movdqu	80(%edi),%xmm5
 	paddq	%xmm4,%xmm7
-.byte	102,15,56,0,238
+	pshufb	%xmm6,%xmm5
 	movdqa	%xmm7,-64(%edx)
 	movdqa	%xmm0,(%edx)
 	movdqa	80(%ebp),%xmm0
 	movdqa	%xmm6,%xmm7
 	movdqu	96(%edi),%xmm6
 	paddq	%xmm5,%xmm0
-.byte	102,15,56,0,247
+	pshufb	%xmm7,%xmm6
 	movdqa	%xmm0,-48(%edx)
 	movdqa	%xmm1,16(%edx)
 	movdqa	96(%ebp),%xmm1
 	movdqa	%xmm7,%xmm0
 	movdqu	112(%edi),%xmm7
 	paddq	%xmm6,%xmm1
-.byte	102,15,56,0,248
+	pshufb	%xmm0,%xmm7
 	movdqa	%xmm1,-32(%edx)
 	movdqa	%xmm2,32(%edx)
 	movdqa	112(%ebp),%xmm2
@@ -481,9 +481,9 @@
 .L00600_47_ssse3:
 	movdqa	%xmm5,%xmm3
 	movdqa	%xmm2,%xmm1
-.byte	102,15,58,15,208,8
+	palignr	$8,%xmm0,%xmm2
 	movdqa	%xmm4,(%edx)
-.byte	102,15,58,15,220,8
+	palignr	$8,%xmm4,%xmm3
 	movdqa	%xmm2,%xmm4
 	psrlq	$7,%xmm2
 	paddq	%xmm3,%xmm0
@@ -608,9 +608,9 @@
 	movdqa	%xmm2,-128(%edx)
 	movdqa	%xmm6,%xmm4
 	movdqa	%xmm3,%xmm2
-.byte	102,15,58,15,217,8
+	palignr	$8,%xmm1,%xmm3
 	movdqa	%xmm5,16(%edx)
-.byte	102,15,58,15,229,8
+	palignr	$8,%xmm5,%xmm4
 	movdqa	%xmm3,%xmm5
 	psrlq	$7,%xmm3
 	paddq	%xmm4,%xmm1
@@ -735,9 +735,9 @@
 	movdqa	%xmm3,-112(%edx)
 	movdqa	%xmm7,%xmm5
 	movdqa	%xmm4,%xmm3
-.byte	102,15,58,15,226,8
+	palignr	$8,%xmm2,%xmm4
 	movdqa	%xmm6,32(%edx)
-.byte	102,15,58,15,238,8
+	palignr	$8,%xmm6,%xmm5
 	movdqa	%xmm4,%xmm6
 	psrlq	$7,%xmm4
 	paddq	%xmm5,%xmm2
@@ -862,9 +862,9 @@
 	movdqa	%xmm4,-96(%edx)
 	movdqa	%xmm0,%xmm6
 	movdqa	%xmm5,%xmm4
-.byte	102,15,58,15,235,8
+	palignr	$8,%xmm3,%xmm5
 	movdqa	%xmm7,48(%edx)
-.byte	102,15,58,15,247,8
+	palignr	$8,%xmm7,%xmm6
 	movdqa	%xmm5,%xmm7
 	psrlq	$7,%xmm5
 	paddq	%xmm6,%xmm3
@@ -989,9 +989,9 @@
 	movdqa	%xmm5,-80(%edx)
 	movdqa	%xmm1,%xmm7
 	movdqa	%xmm6,%xmm5
-.byte	102,15,58,15,244,8
+	palignr	$8,%xmm4,%xmm6
 	movdqa	%xmm0,(%edx)
-.byte	102,15,58,15,248,8
+	palignr	$8,%xmm0,%xmm7
 	movdqa	%xmm6,%xmm0
 	psrlq	$7,%xmm6
 	paddq	%xmm7,%xmm4
@@ -1116,9 +1116,9 @@
 	movdqa	%xmm6,-64(%edx)
 	movdqa	%xmm2,%xmm0
 	movdqa	%xmm7,%xmm6
-.byte	102,15,58,15,253,8
+	palignr	$8,%xmm5,%xmm7
 	movdqa	%xmm1,16(%edx)
-.byte	102,15,58,15,193,8
+	palignr	$8,%xmm1,%xmm0
 	movdqa	%xmm7,%xmm1
 	psrlq	$7,%xmm7
 	paddq	%xmm0,%xmm5
@@ -1243,9 +1243,9 @@
 	movdqa	%xmm7,-48(%edx)
 	movdqa	%xmm3,%xmm1
 	movdqa	%xmm0,%xmm7
-.byte	102,15,58,15,198,8
+	palignr	$8,%xmm6,%xmm0
 	movdqa	%xmm2,32(%edx)
-.byte	102,15,58,15,202,8
+	palignr	$8,%xmm2,%xmm1
 	movdqa	%xmm0,%xmm2
 	psrlq	$7,%xmm0
 	paddq	%xmm1,%xmm6
@@ -1370,9 +1370,9 @@
 	movdqa	%xmm0,-32(%edx)
 	movdqa	%xmm4,%xmm2
 	movdqa	%xmm1,%xmm0
-.byte	102,15,58,15,207,8
+	palignr	$8,%xmm7,%xmm1
 	movdqa	%xmm3,48(%edx)
-.byte	102,15,58,15,211,8
+	palignr	$8,%xmm3,%xmm2
 	movdqa	%xmm1,%xmm3
 	psrlq	$7,%xmm1
 	paddq	%xmm2,%xmm7
@@ -1501,12 +1501,12 @@
 	movdqa	(%ebp),%xmm1
 	leal	-640(%ebp),%ebp
 	movdqu	(%ebx),%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	movdqa	(%ebp),%xmm3
 	movdqa	%xmm1,%xmm2
 	movdqu	16(%ebx),%xmm1
 	paddq	%xmm0,%xmm3
-.byte	102,15,56,0,202
+	pshufb	%xmm2,%xmm1
 	movq	%mm4,%mm1
 	movq	-128(%edx),%mm7
 	pxor	%mm6,%mm5
@@ -1604,7 +1604,7 @@
 	movdqa	%xmm2,%xmm3
 	movdqu	32(%ebx),%xmm2
 	paddq	%xmm1,%xmm4
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movq	%mm4,%mm1
 	movq	-112(%edx),%mm7
 	pxor	%mm6,%mm5
@@ -1702,7 +1702,7 @@
 	movdqa	%xmm3,%xmm4
 	movdqu	48(%ebx),%xmm3
 	paddq	%xmm2,%xmm5
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	movq	%mm4,%mm1
 	movq	-96(%edx),%mm7
 	pxor	%mm6,%mm5
@@ -1800,7 +1800,7 @@
 	movdqa	%xmm4,%xmm5
 	movdqu	64(%ebx),%xmm4
 	paddq	%xmm3,%xmm6
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	movq	%mm4,%mm1
 	movq	-80(%edx),%mm7
 	pxor	%mm6,%mm5
@@ -1898,7 +1898,7 @@
 	movdqa	%xmm5,%xmm6
 	movdqu	80(%ebx),%xmm5
 	paddq	%xmm4,%xmm7
-.byte	102,15,56,0,238
+	pshufb	%xmm6,%xmm5
 	movq	%mm4,%mm1
 	movq	-64(%edx),%mm7
 	pxor	%mm6,%mm5
@@ -1997,7 +1997,7 @@
 	movdqa	%xmm6,%xmm7
 	movdqu	96(%ebx),%xmm6
 	paddq	%xmm5,%xmm0
-.byte	102,15,56,0,247
+	pshufb	%xmm7,%xmm6
 	movq	%mm4,%mm1
 	movq	-48(%edx),%mm7
 	pxor	%mm6,%mm5
@@ -2096,7 +2096,7 @@
 	movdqa	%xmm7,%xmm0
 	movdqu	112(%ebx),%xmm7
 	paddq	%xmm6,%xmm1
-.byte	102,15,56,0,248
+	pshufb	%xmm0,%xmm7
 	movq	%mm4,%mm1
 	movq	-32(%edx),%mm7
 	pxor	%mm6,%mm5
diff --git a/gen/bcm/sha512-586-win.asm b/gen/bcm/sha512-586-win.asm
index 75129dc..2089cf8 100644
--- a/gen/bcm/sha512-586-win.asm
+++ b/gen/bcm/sha512-586-win.asm
@@ -410,50 +410,50 @@
 	sub	esp,256
 	movdqa	xmm1,[640+ebp]
 	movdqu	xmm0,[edi]
-db	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	movdqa	xmm3,[ebp]
 	movdqa	xmm2,xmm1
 	movdqu	xmm1,[16+edi]
 	paddq	xmm3,xmm0
-db	102,15,56,0,202
+	pshufb	xmm1,xmm2
 	movdqa	[edx-128],xmm3
 	movdqa	xmm4,[16+ebp]
 	movdqa	xmm3,xmm2
 	movdqu	xmm2,[32+edi]
 	paddq	xmm4,xmm1
-db	102,15,56,0,211
+	pshufb	xmm2,xmm3
 	movdqa	[edx-112],xmm4
 	movdqa	xmm5,[32+ebp]
 	movdqa	xmm4,xmm3
 	movdqu	xmm3,[48+edi]
 	paddq	xmm5,xmm2
-db	102,15,56,0,220
+	pshufb	xmm3,xmm4
 	movdqa	[edx-96],xmm5
 	movdqa	xmm6,[48+ebp]
 	movdqa	xmm5,xmm4
 	movdqu	xmm4,[64+edi]
 	paddq	xmm6,xmm3
-db	102,15,56,0,229
+	pshufb	xmm4,xmm5
 	movdqa	[edx-80],xmm6
 	movdqa	xmm7,[64+ebp]
 	movdqa	xmm6,xmm5
 	movdqu	xmm5,[80+edi]
 	paddq	xmm7,xmm4
-db	102,15,56,0,238
+	pshufb	xmm5,xmm6
 	movdqa	[edx-64],xmm7
 	movdqa	[edx],xmm0
 	movdqa	xmm0,[80+ebp]
 	movdqa	xmm7,xmm6
 	movdqu	xmm6,[96+edi]
 	paddq	xmm0,xmm5
-db	102,15,56,0,247
+	pshufb	xmm6,xmm7
 	movdqa	[edx-48],xmm0
 	movdqa	[16+edx],xmm1
 	movdqa	xmm1,[96+ebp]
 	movdqa	xmm0,xmm7
 	movdqu	xmm7,[112+edi]
 	paddq	xmm1,xmm6
-db	102,15,56,0,248
+	pshufb	xmm7,xmm0
 	movdqa	[edx-32],xmm1
 	movdqa	[32+edx],xmm2
 	movdqa	xmm2,[112+ebp]
@@ -484,9 +484,9 @@
 L$00600_47_ssse3:
 	movdqa	xmm3,xmm5
 	movdqa	xmm1,xmm2
-db	102,15,58,15,208,8
+	palignr	xmm2,xmm0,8
 	movdqa	[edx],xmm4
-db	102,15,58,15,220,8
+	palignr	xmm3,xmm4,8
 	movdqa	xmm4,xmm2
 	psrlq	xmm2,7
 	paddq	xmm0,xmm3
@@ -611,9 +611,9 @@
 	movdqa	[edx-128],xmm2
 	movdqa	xmm4,xmm6
 	movdqa	xmm2,xmm3
-db	102,15,58,15,217,8
+	palignr	xmm3,xmm1,8
 	movdqa	[16+edx],xmm5
-db	102,15,58,15,229,8
+	palignr	xmm4,xmm5,8
 	movdqa	xmm5,xmm3
 	psrlq	xmm3,7
 	paddq	xmm1,xmm4
@@ -738,9 +738,9 @@
 	movdqa	[edx-112],xmm3
 	movdqa	xmm5,xmm7
 	movdqa	xmm3,xmm4
-db	102,15,58,15,226,8
+	palignr	xmm4,xmm2,8
 	movdqa	[32+edx],xmm6
-db	102,15,58,15,238,8
+	palignr	xmm5,xmm6,8
 	movdqa	xmm6,xmm4
 	psrlq	xmm4,7
 	paddq	xmm2,xmm5
@@ -865,9 +865,9 @@
 	movdqa	[edx-96],xmm4
 	movdqa	xmm6,xmm0
 	movdqa	xmm4,xmm5
-db	102,15,58,15,235,8
+	palignr	xmm5,xmm3,8
 	movdqa	[48+edx],xmm7
-db	102,15,58,15,247,8
+	palignr	xmm6,xmm7,8
 	movdqa	xmm7,xmm5
 	psrlq	xmm5,7
 	paddq	xmm3,xmm6
@@ -992,9 +992,9 @@
 	movdqa	[edx-80],xmm5
 	movdqa	xmm7,xmm1
 	movdqa	xmm5,xmm6
-db	102,15,58,15,244,8
+	palignr	xmm6,xmm4,8
 	movdqa	[edx],xmm0
-db	102,15,58,15,248,8
+	palignr	xmm7,xmm0,8
 	movdqa	xmm0,xmm6
 	psrlq	xmm6,7
 	paddq	xmm4,xmm7
@@ -1119,9 +1119,9 @@
 	movdqa	[edx-64],xmm6
 	movdqa	xmm0,xmm2
 	movdqa	xmm6,xmm7
-db	102,15,58,15,253,8
+	palignr	xmm7,xmm5,8
 	movdqa	[16+edx],xmm1
-db	102,15,58,15,193,8
+	palignr	xmm0,xmm1,8
 	movdqa	xmm1,xmm7
 	psrlq	xmm7,7
 	paddq	xmm5,xmm0
@@ -1246,9 +1246,9 @@
 	movdqa	[edx-48],xmm7
 	movdqa	xmm1,xmm3
 	movdqa	xmm7,xmm0
-db	102,15,58,15,198,8
+	palignr	xmm0,xmm6,8
 	movdqa	[32+edx],xmm2
-db	102,15,58,15,202,8
+	palignr	xmm1,xmm2,8
 	movdqa	xmm2,xmm0
 	psrlq	xmm0,7
 	paddq	xmm6,xmm1
@@ -1373,9 +1373,9 @@
 	movdqa	[edx-32],xmm0
 	movdqa	xmm2,xmm4
 	movdqa	xmm0,xmm1
-db	102,15,58,15,207,8
+	palignr	xmm1,xmm7,8
 	movdqa	[48+edx],xmm3
-db	102,15,58,15,211,8
+	palignr	xmm2,xmm3,8
 	movdqa	xmm3,xmm1
 	psrlq	xmm1,7
 	paddq	xmm7,xmm2
@@ -1504,12 +1504,12 @@
 	movdqa	xmm1,[ebp]
 	lea	ebp,[ebp-640]
 	movdqu	xmm0,[ebx]
-db	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	movdqa	xmm3,[ebp]
 	movdqa	xmm2,xmm1
 	movdqu	xmm1,[16+ebx]
 	paddq	xmm3,xmm0
-db	102,15,56,0,202
+	pshufb	xmm1,xmm2
 	movq	mm1,mm4
 	movq	mm7,[edx-128]
 	pxor	mm5,mm6
@@ -1607,7 +1607,7 @@
 	movdqa	xmm3,xmm2
 	movdqu	xmm2,[32+ebx]
 	paddq	xmm4,xmm1
-db	102,15,56,0,211
+	pshufb	xmm2,xmm3
 	movq	mm1,mm4
 	movq	mm7,[edx-112]
 	pxor	mm5,mm6
@@ -1705,7 +1705,7 @@
 	movdqa	xmm4,xmm3
 	movdqu	xmm3,[48+ebx]
 	paddq	xmm5,xmm2
-db	102,15,56,0,220
+	pshufb	xmm3,xmm4
 	movq	mm1,mm4
 	movq	mm7,[edx-96]
 	pxor	mm5,mm6
@@ -1803,7 +1803,7 @@
 	movdqa	xmm5,xmm4
 	movdqu	xmm4,[64+ebx]
 	paddq	xmm6,xmm3
-db	102,15,56,0,229
+	pshufb	xmm4,xmm5
 	movq	mm1,mm4
 	movq	mm7,[edx-80]
 	pxor	mm5,mm6
@@ -1901,7 +1901,7 @@
 	movdqa	xmm6,xmm5
 	movdqu	xmm5,[80+ebx]
 	paddq	xmm7,xmm4
-db	102,15,56,0,238
+	pshufb	xmm5,xmm6
 	movq	mm1,mm4
 	movq	mm7,[edx-64]
 	pxor	mm5,mm6
@@ -2000,7 +2000,7 @@
 	movdqa	xmm7,xmm6
 	movdqu	xmm6,[96+ebx]
 	paddq	xmm0,xmm5
-db	102,15,56,0,247
+	pshufb	xmm6,xmm7
 	movq	mm1,mm4
 	movq	mm7,[edx-48]
 	pxor	mm5,mm6
@@ -2099,7 +2099,7 @@
 	movdqa	xmm0,xmm7
 	movdqu	xmm7,[112+ebx]
 	paddq	xmm1,xmm6
-db	102,15,56,0,248
+	pshufb	xmm7,xmm0
 	movq	mm1,mm4
 	movq	mm7,[edx-32]
 	pxor	mm5,mm6
diff --git a/gen/bcm/vpaes-x86-apple.S b/gen/bcm/vpaes-x86-apple.S
index 02d3787..b6717d5 100644
--- a/gen/bcm/vpaes-x86-apple.S
+++ b/gen/bcm/vpaes-x86-apple.S
@@ -81,12 +81,12 @@
 	pandn	%xmm0,%xmm1
 	pand	%xmm6,%xmm0
 	movdqu	(%edx),%xmm5
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	16(%ebp),%xmm0
 	pxor	%xmm5,%xmm2
 	psrld	$4,%xmm1
 	addl	$16,%edx
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	leal	192(%ebp),%ebx
 	pxor	%xmm2,%xmm0
 	jmp	L000enc_entry
@@ -94,25 +94,25 @@
 L001enc_loop:
 	movdqa	32(%ebp),%xmm4
 	movdqa	48(%ebp),%xmm0
-.byte	102,15,56,0,226
-.byte	102,15,56,0,195
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm3,%xmm0
 	pxor	%xmm5,%xmm4
 	movdqa	64(%ebp),%xmm5
 	pxor	%xmm4,%xmm0
 	movdqa	-64(%ebx,%ecx,1),%xmm1
-.byte	102,15,56,0,234
+	pshufb	%xmm2,%xmm5
 	movdqa	80(%ebp),%xmm2
 	movdqa	(%ebx,%ecx,1),%xmm4
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm0,%xmm3
 	pxor	%xmm5,%xmm2
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	addl	$16,%edx
 	pxor	%xmm2,%xmm0
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	addl	$16,%ecx
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	andl	$48,%ecx
 	subl	$1,%eax
 	pxor	%xmm3,%xmm0
@@ -122,30 +122,30 @@
 	pandn	%xmm0,%xmm1
 	psrld	$4,%xmm1
 	pand	%xmm6,%xmm0
-.byte	102,15,56,0,232
+	pshufb	%xmm0,%xmm5
 	movdqa	%xmm7,%xmm3
 	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	movdqa	%xmm7,%xmm4
 	pxor	%xmm5,%xmm3
-.byte	102,15,56,0,224
+	pshufb	%xmm0,%xmm4
 	movdqa	%xmm7,%xmm2
 	pxor	%xmm5,%xmm4
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm7,%xmm3
 	pxor	%xmm0,%xmm2
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	movdqu	(%edx),%xmm5
 	pxor	%xmm1,%xmm3
 	jnz	L001enc_loop
 	movdqa	96(%ebp),%xmm4
 	movdqa	112(%ebp),%xmm0
-.byte	102,15,56,0,226
+	pshufb	%xmm2,%xmm4
 	pxor	%xmm5,%xmm4
-.byte	102,15,56,0,195
+	pshufb	%xmm3,%xmm0
 	movdqa	64(%ebx,%ecx,1),%xmm1
 	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	ret
 .private_extern	__vpaes_decrypt_core
 .align	4
@@ -160,10 +160,10 @@
 	movdqu	(%edx),%xmm5
 	shll	$4,%ecx
 	pand	%xmm6,%xmm0
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	-48(%ebx),%xmm0
 	xorl	$48,%ecx
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	andl	$48,%ecx
 	pxor	%xmm5,%xmm2
 	movdqa	176(%ebp),%xmm5
@@ -175,32 +175,32 @@
 L003dec_loop:
 	movdqa	-32(%ebx),%xmm4
 	movdqa	-16(%ebx),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	movdqa	(%ebx),%xmm4
 	pxor	%xmm1,%xmm0
 	movdqa	16(%ebx),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm5,%xmm0
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	movdqa	32(%ebx),%xmm4
 	pxor	%xmm1,%xmm0
 	movdqa	48(%ebx),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm5,%xmm0
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	movdqa	64(%ebx),%xmm4
 	pxor	%xmm1,%xmm0
 	movdqa	80(%ebx),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm5,%xmm0
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	addl	$16,%edx
-.byte	102,15,58,15,237,12
+	palignr	$12,%xmm5,%xmm5
 	pxor	%xmm1,%xmm0
 	subl	$1,%eax
 L002dec_entry:
@@ -209,30 +209,30 @@
 	pandn	%xmm0,%xmm1
 	pand	%xmm6,%xmm0
 	psrld	$4,%xmm1
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	%xmm7,%xmm3
 	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	movdqa	%xmm7,%xmm4
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,224
+	pshufb	%xmm0,%xmm4
 	pxor	%xmm2,%xmm4
 	movdqa	%xmm7,%xmm2
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm7,%xmm3
 	pxor	%xmm0,%xmm2
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	movdqu	(%edx),%xmm0
 	pxor	%xmm1,%xmm3
 	jnz	L003dec_loop
 	movdqa	96(%ebx),%xmm4
-.byte	102,15,56,0,226
+	pshufb	%xmm2,%xmm4
 	pxor	%xmm0,%xmm4
 	movdqa	112(%ebx),%xmm0
 	movdqa	(%ecx),%xmm2
-.byte	102,15,56,0,195
+	pshufb	%xmm3,%xmm0
 	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,194
+	pshufb	%xmm2,%xmm0
 	ret
 .private_extern	__vpaes_schedule_core
 .align	4
@@ -251,7 +251,7 @@
 	jmp	L005schedule_go
 L004schedule_am_decrypting:
 	movdqa	256(%ebp,%ecx,1),%xmm1
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	movdqu	%xmm3,(%edx)
 	xorl	$48,%ecx
 L005schedule_go:
@@ -276,7 +276,7 @@
 	movl	$4,%eax
 L011loop_schedule_192:
 	call	__vpaes_schedule_round
-.byte	102,15,58,15,198,8
+	palignr	$8,%xmm6,%xmm0
 	call	__vpaes_schedule_mangle
 	call	__vpaes_schedule_192_smear
 	call	__vpaes_schedule_mangle
@@ -310,7 +310,7 @@
 	testl	%edi,%edi
 	jnz	L013schedule_mangle_last_dec
 	movdqa	256(%ebp,%ecx,1),%xmm1
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	leal	352(%ebp),%ebx
 	addl	$32,%edx
 L013schedule_mangle_last_dec:
@@ -343,11 +343,11 @@
 __vpaes_schedule_round:
 	movdqa	8(%esp),%xmm2
 	pxor	%xmm1,%xmm1
-.byte	102,15,58,15,202,15
-.byte	102,15,58,15,210,15
+	palignr	$15,%xmm2,%xmm1
+	palignr	$15,%xmm2,%xmm2
 	pxor	%xmm1,%xmm7
 	pshufd	$255,%xmm0,%xmm0
-.byte	102,15,58,15,192,1
+	palignr	$1,%xmm0,%xmm0
 	movdqa	%xmm2,8(%esp)
 L_vpaes_schedule_low_round:
 	movdqa	%xmm7,%xmm1
@@ -364,24 +364,24 @@
 	psrld	$4,%xmm1
 	pand	%xmm4,%xmm0
 	movdqa	-32(%ebp),%xmm2
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	pxor	%xmm1,%xmm0
 	movdqa	%xmm5,%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
 	movdqa	%xmm5,%xmm4
-.byte	102,15,56,0,224
+	pshufb	%xmm0,%xmm4
 	pxor	%xmm2,%xmm4
 	movdqa	%xmm5,%xmm2
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	pxor	%xmm0,%xmm2
 	movdqa	%xmm5,%xmm3
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	pxor	%xmm1,%xmm3
 	movdqa	32(%ebp),%xmm4
-.byte	102,15,56,0,226
+	pshufb	%xmm2,%xmm4
 	movdqa	48(%ebp),%xmm0
-.byte	102,15,56,0,195
+	pshufb	%xmm3,%xmm0
 	pxor	%xmm4,%xmm0
 	pxor	%xmm7,%xmm0
 	movdqa	%xmm0,%xmm7
@@ -395,9 +395,9 @@
 	psrld	$4,%xmm1
 	pand	%xmm2,%xmm0
 	movdqa	(%ebx),%xmm2
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	16(%ebx),%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	pxor	%xmm2,%xmm0
 	ret
 .private_extern	__vpaes_schedule_mangle
@@ -409,11 +409,11 @@
 	jnz	L014schedule_mangle_dec
 	addl	$16,%edx
 	pxor	336(%ebp),%xmm4
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	movdqa	%xmm4,%xmm3
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	pxor	%xmm4,%xmm3
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	pxor	%xmm4,%xmm3
 	jmp	L015schedule_mangle_both
 .align	4,0x90
@@ -425,35 +425,35 @@
 	psrld	$4,%xmm1
 	pand	%xmm2,%xmm4
 	movdqa	(%esi),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	movdqa	16(%esi),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 	movdqa	32(%esi),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	pxor	%xmm3,%xmm2
 	movdqa	48(%esi),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 	movdqa	64(%esi),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	pxor	%xmm3,%xmm2
 	movdqa	80(%esi),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 	movdqa	96(%esi),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	pxor	%xmm3,%xmm2
 	movdqa	112(%esi),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
 	addl	$-16,%edx
 L015schedule_mangle_both:
 	movdqa	256(%ebp,%ecx,1),%xmm1
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	addl	$-16,%ecx
 	andl	$48,%ecx
 	movdqu	%xmm3,(%edx)
diff --git a/gen/bcm/vpaes-x86-linux.S b/gen/bcm/vpaes-x86-linux.S
index 31dc9a0..13da4aa 100644
--- a/gen/bcm/vpaes-x86-linux.S
+++ b/gen/bcm/vpaes-x86-linux.S
@@ -84,12 +84,12 @@
 	pandn	%xmm0,%xmm1
 	pand	%xmm6,%xmm0
 	movdqu	(%edx),%xmm5
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	16(%ebp),%xmm0
 	pxor	%xmm5,%xmm2
 	psrld	$4,%xmm1
 	addl	$16,%edx
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	leal	192(%ebp),%ebx
 	pxor	%xmm2,%xmm0
 	jmp	.L000enc_entry
@@ -97,25 +97,25 @@
 .L001enc_loop:
 	movdqa	32(%ebp),%xmm4
 	movdqa	48(%ebp),%xmm0
-.byte	102,15,56,0,226
-.byte	102,15,56,0,195
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm3,%xmm0
 	pxor	%xmm5,%xmm4
 	movdqa	64(%ebp),%xmm5
 	pxor	%xmm4,%xmm0
 	movdqa	-64(%ebx,%ecx,1),%xmm1
-.byte	102,15,56,0,234
+	pshufb	%xmm2,%xmm5
 	movdqa	80(%ebp),%xmm2
 	movdqa	(%ebx,%ecx,1),%xmm4
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm0,%xmm3
 	pxor	%xmm5,%xmm2
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	addl	$16,%edx
 	pxor	%xmm2,%xmm0
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	addl	$16,%ecx
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	andl	$48,%ecx
 	subl	$1,%eax
 	pxor	%xmm3,%xmm0
@@ -125,30 +125,30 @@
 	pandn	%xmm0,%xmm1
 	psrld	$4,%xmm1
 	pand	%xmm6,%xmm0
-.byte	102,15,56,0,232
+	pshufb	%xmm0,%xmm5
 	movdqa	%xmm7,%xmm3
 	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	movdqa	%xmm7,%xmm4
 	pxor	%xmm5,%xmm3
-.byte	102,15,56,0,224
+	pshufb	%xmm0,%xmm4
 	movdqa	%xmm7,%xmm2
 	pxor	%xmm5,%xmm4
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm7,%xmm3
 	pxor	%xmm0,%xmm2
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	movdqu	(%edx),%xmm5
 	pxor	%xmm1,%xmm3
 	jnz	.L001enc_loop
 	movdqa	96(%ebp),%xmm4
 	movdqa	112(%ebp),%xmm0
-.byte	102,15,56,0,226
+	pshufb	%xmm2,%xmm4
 	pxor	%xmm5,%xmm4
-.byte	102,15,56,0,195
+	pshufb	%xmm3,%xmm0
 	movdqa	64(%ebx,%ecx,1),%xmm1
 	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	ret
 .size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
 .hidden	_vpaes_decrypt_core
@@ -165,10 +165,10 @@
 	movdqu	(%edx),%xmm5
 	shll	$4,%ecx
 	pand	%xmm6,%xmm0
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	-48(%ebx),%xmm0
 	xorl	$48,%ecx
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	andl	$48,%ecx
 	pxor	%xmm5,%xmm2
 	movdqa	176(%ebp),%xmm5
@@ -180,32 +180,32 @@
 .L003dec_loop:
 	movdqa	-32(%ebx),%xmm4
 	movdqa	-16(%ebx),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	movdqa	(%ebx),%xmm4
 	pxor	%xmm1,%xmm0
 	movdqa	16(%ebx),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm5,%xmm0
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	movdqa	32(%ebx),%xmm4
 	pxor	%xmm1,%xmm0
 	movdqa	48(%ebx),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm5,%xmm0
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	movdqa	64(%ebx),%xmm4
 	pxor	%xmm1,%xmm0
 	movdqa	80(%ebx),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm5,%xmm0
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	addl	$16,%edx
-.byte	102,15,58,15,237,12
+	palignr	$12,%xmm5,%xmm5
 	pxor	%xmm1,%xmm0
 	subl	$1,%eax
 .L002dec_entry:
@@ -214,30 +214,30 @@
 	pandn	%xmm0,%xmm1
 	pand	%xmm6,%xmm0
 	psrld	$4,%xmm1
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	%xmm7,%xmm3
 	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	movdqa	%xmm7,%xmm4
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,224
+	pshufb	%xmm0,%xmm4
 	pxor	%xmm2,%xmm4
 	movdqa	%xmm7,%xmm2
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm7,%xmm3
 	pxor	%xmm0,%xmm2
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	movdqu	(%edx),%xmm0
 	pxor	%xmm1,%xmm3
 	jnz	.L003dec_loop
 	movdqa	96(%ebx),%xmm4
-.byte	102,15,56,0,226
+	pshufb	%xmm2,%xmm4
 	pxor	%xmm0,%xmm4
 	movdqa	112(%ebx),%xmm0
 	movdqa	(%ecx),%xmm2
-.byte	102,15,56,0,195
+	pshufb	%xmm3,%xmm0
 	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,194
+	pshufb	%xmm2,%xmm0
 	ret
 .size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
 .hidden	_vpaes_schedule_core
@@ -258,7 +258,7 @@
 	jmp	.L005schedule_go
 .L004schedule_am_decrypting:
 	movdqa	256(%ebp,%ecx,1),%xmm1
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	movdqu	%xmm3,(%edx)
 	xorl	$48,%ecx
 .L005schedule_go:
@@ -283,7 +283,7 @@
 	movl	$4,%eax
 .L011loop_schedule_192:
 	call	_vpaes_schedule_round
-.byte	102,15,58,15,198,8
+	palignr	$8,%xmm6,%xmm0
 	call	_vpaes_schedule_mangle
 	call	_vpaes_schedule_192_smear
 	call	_vpaes_schedule_mangle
@@ -317,7 +317,7 @@
 	testl	%edi,%edi
 	jnz	.L013schedule_mangle_last_dec
 	movdqa	256(%ebp,%ecx,1),%xmm1
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	leal	352(%ebp),%ebx
 	addl	$32,%edx
 .L013schedule_mangle_last_dec:
@@ -354,11 +354,11 @@
 _vpaes_schedule_round:
 	movdqa	8(%esp),%xmm2
 	pxor	%xmm1,%xmm1
-.byte	102,15,58,15,202,15
-.byte	102,15,58,15,210,15
+	palignr	$15,%xmm2,%xmm1
+	palignr	$15,%xmm2,%xmm2
 	pxor	%xmm1,%xmm7
 	pshufd	$255,%xmm0,%xmm0
-.byte	102,15,58,15,192,1
+	palignr	$1,%xmm0,%xmm0
 	movdqa	%xmm2,8(%esp)
 .L_vpaes_schedule_low_round:
 	movdqa	%xmm7,%xmm1
@@ -375,24 +375,24 @@
 	psrld	$4,%xmm1
 	pand	%xmm4,%xmm0
 	movdqa	-32(%ebp),%xmm2
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	pxor	%xmm1,%xmm0
 	movdqa	%xmm5,%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
 	movdqa	%xmm5,%xmm4
-.byte	102,15,56,0,224
+	pshufb	%xmm0,%xmm4
 	pxor	%xmm2,%xmm4
 	movdqa	%xmm5,%xmm2
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	pxor	%xmm0,%xmm2
 	movdqa	%xmm5,%xmm3
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	pxor	%xmm1,%xmm3
 	movdqa	32(%ebp),%xmm4
-.byte	102,15,56,0,226
+	pshufb	%xmm2,%xmm4
 	movdqa	48(%ebp),%xmm0
-.byte	102,15,56,0,195
+	pshufb	%xmm3,%xmm0
 	pxor	%xmm4,%xmm0
 	pxor	%xmm7,%xmm0
 	movdqa	%xmm0,%xmm7
@@ -408,9 +408,9 @@
 	psrld	$4,%xmm1
 	pand	%xmm2,%xmm0
 	movdqa	(%ebx),%xmm2
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	16(%ebx),%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	pxor	%xmm2,%xmm0
 	ret
 .size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
@@ -424,11 +424,11 @@
 	jnz	.L014schedule_mangle_dec
 	addl	$16,%edx
 	pxor	336(%ebp),%xmm4
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	movdqa	%xmm4,%xmm3
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	pxor	%xmm4,%xmm3
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	pxor	%xmm4,%xmm3
 	jmp	.L015schedule_mangle_both
 .align	16
@@ -440,35 +440,35 @@
 	psrld	$4,%xmm1
 	pand	%xmm2,%xmm4
 	movdqa	(%esi),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	movdqa	16(%esi),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 	movdqa	32(%esi),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	pxor	%xmm3,%xmm2
 	movdqa	48(%esi),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 	movdqa	64(%esi),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	pxor	%xmm3,%xmm2
 	movdqa	80(%esi),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 	movdqa	96(%esi),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	pxor	%xmm3,%xmm2
 	movdqa	112(%esi),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
 	addl	$-16,%edx
 .L015schedule_mangle_both:
 	movdqa	256(%ebp,%ecx,1),%xmm1
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	addl	$-16,%ecx
 	andl	$48,%ecx
 	movdqu	%xmm3,(%edx)
diff --git a/gen/bcm/vpaes-x86-win.asm b/gen/bcm/vpaes-x86-win.asm
index 137b31e..30ba96c 100644
--- a/gen/bcm/vpaes-x86-win.asm
+++ b/gen/bcm/vpaes-x86-win.asm
@@ -88,12 +88,12 @@
 	pandn	xmm1,xmm0
 	pand	xmm0,xmm6
 	movdqu	xmm5,[edx]
-db	102,15,56,0,208
+	pshufb	xmm2,xmm0
 	movdqa	xmm0,[16+ebp]
 	pxor	xmm2,xmm5
 	psrld	xmm1,4
 	add	edx,16
-db	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	lea	ebx,[192+ebp]
 	pxor	xmm0,xmm2
 	jmp	NEAR L$000enc_entry
@@ -101,25 +101,25 @@
 L$001enc_loop:
 	movdqa	xmm4,[32+ebp]
 	movdqa	xmm0,[48+ebp]
-db	102,15,56,0,226
-db	102,15,56,0,195
+	pshufb	xmm4,xmm2
+	pshufb	xmm0,xmm3
 	pxor	xmm4,xmm5
 	movdqa	xmm5,[64+ebp]
 	pxor	xmm0,xmm4
 	movdqa	xmm1,[ecx*1+ebx-64]
-db	102,15,56,0,234
+	pshufb	xmm5,xmm2
 	movdqa	xmm2,[80+ebp]
 	movdqa	xmm4,[ecx*1+ebx]
-db	102,15,56,0,211
+	pshufb	xmm2,xmm3
 	movdqa	xmm3,xmm0
 	pxor	xmm2,xmm5
-db	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	add	edx,16
 	pxor	xmm0,xmm2
-db	102,15,56,0,220
+	pshufb	xmm3,xmm4
 	add	ecx,16
 	pxor	xmm3,xmm0
-db	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	and	ecx,48
 	sub	eax,1
 	pxor	xmm0,xmm3
@@ -129,30 +129,30 @@
 	pandn	xmm1,xmm0
 	psrld	xmm1,4
 	pand	xmm0,xmm6
-db	102,15,56,0,232
+	pshufb	xmm5,xmm0
 	movdqa	xmm3,xmm7
 	pxor	xmm0,xmm1
-db	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	movdqa	xmm4,xmm7
 	pxor	xmm3,xmm5
-db	102,15,56,0,224
+	pshufb	xmm4,xmm0
 	movdqa	xmm2,xmm7
 	pxor	xmm4,xmm5
-db	102,15,56,0,211
+	pshufb	xmm2,xmm3
 	movdqa	xmm3,xmm7
 	pxor	xmm2,xmm0
-db	102,15,56,0,220
+	pshufb	xmm3,xmm4
 	movdqu	xmm5,[edx]
 	pxor	xmm3,xmm1
 	jnz	NEAR L$001enc_loop
 	movdqa	xmm4,[96+ebp]
 	movdqa	xmm0,[112+ebp]
-db	102,15,56,0,226
+	pshufb	xmm4,xmm2
 	pxor	xmm4,xmm5
-db	102,15,56,0,195
+	pshufb	xmm0,xmm3
 	movdqa	xmm1,[64+ecx*1+ebx]
 	pxor	xmm0,xmm4
-db	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	ret
 align	16
 __vpaes_decrypt_core:
@@ -166,10 +166,10 @@
 	movdqu	xmm5,[edx]
 	shl	ecx,4
 	pand	xmm0,xmm6
-db	102,15,56,0,208
+	pshufb	xmm2,xmm0
 	movdqa	xmm0,[ebx-48]
 	xor	ecx,48
-db	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	and	ecx,48
 	pxor	xmm2,xmm5
 	movdqa	xmm5,[176+ebp]
@@ -181,32 +181,32 @@
 L$003dec_loop:
 	movdqa	xmm4,[ebx-32]
 	movdqa	xmm1,[ebx-16]
-db	102,15,56,0,226
-db	102,15,56,0,203
+	pshufb	xmm4,xmm2
+	pshufb	xmm1,xmm3
 	pxor	xmm0,xmm4
 	movdqa	xmm4,[ebx]
 	pxor	xmm0,xmm1
 	movdqa	xmm1,[16+ebx]
-db	102,15,56,0,226
-db	102,15,56,0,197
-db	102,15,56,0,203
+	pshufb	xmm4,xmm2
+	pshufb	xmm0,xmm5
+	pshufb	xmm1,xmm3
 	pxor	xmm0,xmm4
 	movdqa	xmm4,[32+ebx]
 	pxor	xmm0,xmm1
 	movdqa	xmm1,[48+ebx]
-db	102,15,56,0,226
-db	102,15,56,0,197
-db	102,15,56,0,203
+	pshufb	xmm4,xmm2
+	pshufb	xmm0,xmm5
+	pshufb	xmm1,xmm3
 	pxor	xmm0,xmm4
 	movdqa	xmm4,[64+ebx]
 	pxor	xmm0,xmm1
 	movdqa	xmm1,[80+ebx]
-db	102,15,56,0,226
-db	102,15,56,0,197
-db	102,15,56,0,203
+	pshufb	xmm4,xmm2
+	pshufb	xmm0,xmm5
+	pshufb	xmm1,xmm3
 	pxor	xmm0,xmm4
 	add	edx,16
-db	102,15,58,15,237,12
+	palignr	xmm5,xmm5,12
 	pxor	xmm0,xmm1
 	sub	eax,1
 L$002dec_entry:
@@ -215,30 +215,30 @@
 	pandn	xmm1,xmm0
 	pand	xmm0,xmm6
 	psrld	xmm1,4
-db	102,15,56,0,208
+	pshufb	xmm2,xmm0
 	movdqa	xmm3,xmm7
 	pxor	xmm0,xmm1
-db	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	movdqa	xmm4,xmm7
 	pxor	xmm3,xmm2
-db	102,15,56,0,224
+	pshufb	xmm4,xmm0
 	pxor	xmm4,xmm2
 	movdqa	xmm2,xmm7
-db	102,15,56,0,211
+	pshufb	xmm2,xmm3
 	movdqa	xmm3,xmm7
 	pxor	xmm2,xmm0
-db	102,15,56,0,220
+	pshufb	xmm3,xmm4
 	movdqu	xmm0,[edx]
 	pxor	xmm3,xmm1
 	jnz	NEAR L$003dec_loop
 	movdqa	xmm4,[96+ebx]
-db	102,15,56,0,226
+	pshufb	xmm4,xmm2
 	pxor	xmm4,xmm0
 	movdqa	xmm0,[112+ebx]
 	movdqa	xmm2,[ecx]
-db	102,15,56,0,195
+	pshufb	xmm0,xmm3
 	pxor	xmm0,xmm4
-db	102,15,56,0,194
+	pshufb	xmm0,xmm2
 	ret
 align	16
 __vpaes_schedule_core:
@@ -256,7 +256,7 @@
 	jmp	NEAR L$005schedule_go
 L$004schedule_am_decrypting:
 	movdqa	xmm1,[256+ecx*1+ebp]
-db	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	movdqu	[edx],xmm3
 	xor	ecx,48
 L$005schedule_go:
@@ -281,7 +281,7 @@
 	mov	eax,4
 L$011loop_schedule_192:
 	call	__vpaes_schedule_round
-db	102,15,58,15,198,8
+	palignr	xmm0,xmm6,8
 	call	__vpaes_schedule_mangle
 	call	__vpaes_schedule_192_smear
 	call	__vpaes_schedule_mangle
@@ -315,7 +315,7 @@
 	test	edi,edi
 	jnz	NEAR L$013schedule_mangle_last_dec
 	movdqa	xmm1,[256+ecx*1+ebp]
-db	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	lea	ebx,[352+ebp]
 	add	edx,32
 L$013schedule_mangle_last_dec:
@@ -346,11 +346,11 @@
 __vpaes_schedule_round:
 	movdqa	xmm2,[8+esp]
 	pxor	xmm1,xmm1
-db	102,15,58,15,202,15
-db	102,15,58,15,210,15
+	palignr	xmm1,xmm2,15
+	palignr	xmm2,xmm2,15
 	pxor	xmm7,xmm1
 	pshufd	xmm0,xmm0,255
-db	102,15,58,15,192,1
+	palignr	xmm0,xmm0,1
 	movdqa	[8+esp],xmm2
 L$_vpaes_schedule_low_round:
 	movdqa	xmm1,xmm7
@@ -367,24 +367,24 @@
 	psrld	xmm1,4
 	pand	xmm0,xmm4
 	movdqa	xmm2,[ebp-32]
-db	102,15,56,0,208
+	pshufb	xmm2,xmm0
 	pxor	xmm0,xmm1
 	movdqa	xmm3,xmm5
-db	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	pxor	xmm3,xmm2
 	movdqa	xmm4,xmm5
-db	102,15,56,0,224
+	pshufb	xmm4,xmm0
 	pxor	xmm4,xmm2
 	movdqa	xmm2,xmm5
-db	102,15,56,0,211
+	pshufb	xmm2,xmm3
 	pxor	xmm2,xmm0
 	movdqa	xmm3,xmm5
-db	102,15,56,0,220
+	pshufb	xmm3,xmm4
 	pxor	xmm3,xmm1
 	movdqa	xmm4,[32+ebp]
-db	102,15,56,0,226
+	pshufb	xmm4,xmm2
 	movdqa	xmm0,[48+ebp]
-db	102,15,56,0,195
+	pshufb	xmm0,xmm3
 	pxor	xmm0,xmm4
 	pxor	xmm0,xmm7
 	movdqa	xmm7,xmm0
@@ -397,9 +397,9 @@
 	psrld	xmm1,4
 	pand	xmm0,xmm2
 	movdqa	xmm2,[ebx]
-db	102,15,56,0,208
+	pshufb	xmm2,xmm0
 	movdqa	xmm0,[16+ebx]
-db	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	pxor	xmm0,xmm2
 	ret
 align	16
@@ -410,11 +410,11 @@
 	jnz	NEAR L$014schedule_mangle_dec
 	add	edx,16
 	pxor	xmm4,[336+ebp]
-db	102,15,56,0,229
+	pshufb	xmm4,xmm5
 	movdqa	xmm3,xmm4
-db	102,15,56,0,229
+	pshufb	xmm4,xmm5
 	pxor	xmm3,xmm4
-db	102,15,56,0,229
+	pshufb	xmm4,xmm5
 	pxor	xmm3,xmm4
 	jmp	NEAR L$015schedule_mangle_both
 align	16
@@ -426,35 +426,35 @@
 	psrld	xmm1,4
 	pand	xmm4,xmm2
 	movdqa	xmm2,[esi]
-db	102,15,56,0,212
+	pshufb	xmm2,xmm4
 	movdqa	xmm3,[16+esi]
-db	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	pxor	xmm3,xmm2
-db	102,15,56,0,221
+	pshufb	xmm3,xmm5
 	movdqa	xmm2,[32+esi]
-db	102,15,56,0,212
+	pshufb	xmm2,xmm4
 	pxor	xmm2,xmm3
 	movdqa	xmm3,[48+esi]
-db	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	pxor	xmm3,xmm2
-db	102,15,56,0,221
+	pshufb	xmm3,xmm5
 	movdqa	xmm2,[64+esi]
-db	102,15,56,0,212
+	pshufb	xmm2,xmm4
 	pxor	xmm2,xmm3
 	movdqa	xmm3,[80+esi]
-db	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	pxor	xmm3,xmm2
-db	102,15,56,0,221
+	pshufb	xmm3,xmm5
 	movdqa	xmm2,[96+esi]
-db	102,15,56,0,212
+	pshufb	xmm2,xmm4
 	pxor	xmm2,xmm3
 	movdqa	xmm3,[112+esi]
-db	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	pxor	xmm3,xmm2
 	add	edx,-16
 L$015schedule_mangle_both:
 	movdqa	xmm1,[256+ecx*1+ebp]
-db	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	add	ecx,-16
 	and	ecx,48
 	movdqu	[edx],xmm3
diff --git a/gen/crypto/chacha-x86-apple.S b/gen/crypto/chacha-x86-apple.S
index 48293da..c03fb5b 100644
--- a/gen/crypto/chacha-x86-apple.S
+++ b/gen/crypto/chacha-x86-apple.S
@@ -850,7 +850,7 @@
 L010loop1x:
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -859,7 +859,7 @@
 	por	%xmm4,%xmm1
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -872,7 +872,7 @@
 	nop
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -881,7 +881,7 @@
 	por	%xmm4,%xmm1
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
diff --git a/gen/crypto/chacha-x86-linux.S b/gen/crypto/chacha-x86-linux.S
index 566fbb4..9ad20a0 100644
--- a/gen/crypto/chacha-x86-linux.S
+++ b/gen/crypto/chacha-x86-linux.S
@@ -853,7 +853,7 @@
 .L010loop1x:
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -862,7 +862,7 @@
 	por	%xmm4,%xmm1
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -875,7 +875,7 @@
 	nop
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -884,7 +884,7 @@
 	por	%xmm4,%xmm1
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
diff --git a/gen/crypto/chacha-x86-win.asm b/gen/crypto/chacha-x86-win.asm
index 1e64634..799a6aa 100644
--- a/gen/crypto/chacha-x86-win.asm
+++ b/gen/crypto/chacha-x86-win.asm
@@ -856,7 +856,7 @@
 L$010loop1x:
 	paddd	xmm0,xmm1
 	pxor	xmm3,xmm0
-db	102,15,56,0,222
+	pshufb	xmm3,xmm6
 	paddd	xmm2,xmm3
 	pxor	xmm1,xmm2
 	movdqa	xmm4,xmm1
@@ -865,7 +865,7 @@
 	por	xmm1,xmm4
 	paddd	xmm0,xmm1
 	pxor	xmm3,xmm0
-db	102,15,56,0,223
+	pshufb	xmm3,xmm7
 	paddd	xmm2,xmm3
 	pxor	xmm1,xmm2
 	movdqa	xmm4,xmm1
@@ -878,7 +878,7 @@
 	nop
 	paddd	xmm0,xmm1
 	pxor	xmm3,xmm0
-db	102,15,56,0,222
+	pshufb	xmm3,xmm6
 	paddd	xmm2,xmm3
 	pxor	xmm1,xmm2
 	movdqa	xmm4,xmm1
@@ -887,7 +887,7 @@
 	por	xmm1,xmm4
 	paddd	xmm0,xmm1
 	pxor	xmm3,xmm0
-db	102,15,56,0,223
+	pshufb	xmm3,xmm7
 	paddd	xmm2,xmm3
 	pxor	xmm1,xmm2
 	movdqa	xmm4,xmm1