Stop manually encoding a bunch of x86-64 instructions

The perlasm code used to manually encode some instructions, presumably
to accommodate older assemblers that don't recognize them. The newest of
these (the SHA instructions) appear to have gained assembler support in
binutils 2.24, released in 2013.

Remove the transforms so we don't have to worry about bugs in some
ad-hoc Perl code. I confirmed the output is equivalent by comparing the
`objdump -d` disassembly of the object files assembled before and after
the change.
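
As a rough illustration, a minimal Perl sketch of that kind of comparison
(not part of this change; it assumes objdump is on PATH and takes two
hypothetical object files, one assembled from the old generated .S and one
from the new):

    use strict;
    use warnings;

    my ($old_obj, $new_obj) = @ARGV;
    die "usage: $0 old.o new.o\n" unless defined $new_obj;

    sub disasm {
        my ($obj) = @_;
        # objdump -d prints each instruction's bytes alongside its mnemonic,
        # so identical output means identical encodings.
        my @lines = `objdump -d $obj`;
        die "objdump failed on $obj\n" if $?;
        # Drop the header line that echoes the file name so only the
        # disassembly itself is compared.
        return join '', grep { !/file format/ } @lines;
    }

    print disasm($old_obj) eq disasm($new_obj)
        ? "disassembly identical\n" : "disassembly DIFFERS\n";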

This revealed one issue in the xlate script where it tried to suffix
rdrand, which is apparently unsuffixable.

Change-Id: I51377e38ec06b099e730da29b85743188abf9723
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/77388
Commit-Queue: Bob Beck <bbe@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 08df44e..930ae14 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -3983,64 +3983,7 @@
 ___
 }
 
-sub rex {
-  local *opcode=shift;
-  my ($dst,$src)=@_;
-  my $rex=0;
-
-    $rex|=0x04			if($dst>=8);
-    $rex|=0x01			if($src>=8);
-    push @opcode,$rex|0x40	if($rex);
-}
-
-sub aesni {
-  my $line=shift;
-  my @opcode=(0x66);
-
-    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
-	rex(\@opcode,$4,$3);
-	push @opcode,0x0f,0x3a,0xdf;
-	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
-	my $c=$2;
-	push @opcode,$c=~/^0/?oct($c):$c;
-	return ".byte\t".join(',',@opcode);
-    }
-    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
-	my %opcodelet = (
-		"aesimc" => 0xdb,
-		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
-		"aesdec" => 0xde,	"aesdeclast" => 0xdf
-	);
-	return undef if (!defined($opcodelet{$1}));
-	rex(\@opcode,$3,$2);
-	push @opcode,0x0f,0x38,$opcodelet{$1};
-	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
-	return ".byte\t".join(',',@opcode);
-    }
-    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
-	my %opcodelet = (
-		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
-		"aesdec" => 0xde,	"aesdeclast" => 0xdf
-	);
-	return undef if (!defined($opcodelet{$1}));
-	my $off = $2;
-	push @opcode,0x44 if ($3>=8);
-	push @opcode,0x0f,0x38,$opcodelet{$1};
-	push @opcode,0x44|(($3&7)<<3),0x24;	# ModR/M
-	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
-	return ".byte\t".join(',',@opcode);
-    }
-    return $line;
-}
-
-sub movbe {
-	".byte	0x0f,0x38,0xf1,0x44,0x24,".shift;
-}
-
 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
-$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
-#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
-$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
 
 print $code;
 
diff --git a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
index 2dc24f2..30f1238 100755
--- a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
@@ -2057,45 +2057,9 @@
 
 ####################################################################
 
-sub sha1rnds4 {
-    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
-      my @opcode=(0x0f,0x3a,0xcc);
-	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
-	my $c=$1;
-	push @opcode,$c=~/^0/?oct($c):$c;
-	return ".byte\t".join(',',@opcode);
-    } else {
-	return "sha1rnds4\t".@_[0];
-    }
-}
-
-sub sha1op38 {
-    my $instr = shift;
-    my %opcodelet = (
-		"sha1nexte" => 0xc8,
-  		"sha1msg1"  => 0xc9,
-		"sha1msg2"  => 0xca	);
-
-    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
-      my @opcode=(0x0f,0x38);
-      my $rex=0;
-	$rex|=0x04			if ($2>=8);
-	$rex|=0x01			if ($1>=8);
-	unshift @opcode,0x40|$rex	if ($rex);
-	push @opcode,$opcodelet{$instr};
-	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
-	return ".byte\t".join(',',@opcode);
-    } else {
-	return $instr."\t".@_[0];
-    }
-}
-
 foreach (split("\n",$code)) {
 	s/\`([^\`]*)\`/eval $1/geo;
 
-	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
-	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
-
 	print $_,"\n";
 }
 close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
index 3a31a16..6768bf3 100755
--- a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -2042,28 +2042,9 @@
 ___
 }
 
-sub sha256op38 {
-    my $instr = shift;
-    my %opcodelet = (
-		"sha256rnds2" => 0xcb,
-  		"sha256msg1"  => 0xcc,
-		"sha256msg2"  => 0xcd	);
-
-    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
-      my @opcode=(0x0f,0x38);
-	push @opcode,$opcodelet{$instr};
-	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
-	return ".byte\t".join(',',@opcode);
-    } else {
-	return $instr."\t".@_[0];
-    }
-}
-
 foreach (split("\n",$code)) {
 	s/\`([^\`]*)\`/eval $1/geo;
 
-	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
-
 	print $_,"\n";
 }
 close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl
index a0bade3..2045529 100755
--- a/crypto/perlasm/x86_64-xlate.pl
+++ b/crypto/perlasm/x86_64-xlate.pl
@@ -127,7 +127,7 @@
 	    if ($self->{op} =~ /^(movz)x?([bw]).*/) {	# movz is pain...
 		$self->{op} = $1;
 		$self->{sz} = $2;
-	    } elsif ($self->{op} =~ /call|jmp/) {
+	    } elsif ($self->{op} =~ /call|jmp|^rdrand$/) {
 		$self->{sz} = "";
 	    } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn
 		$self->{sz} = "";
@@ -1319,191 +1319,6 @@
     }
 }
 
-# Upon initial x86_64 introduction SSE>2 extensions were not introduced
-# yet. In order not to be bothered by tracing exact assembler versions,
-# but at the same time to provide a bare security minimum of AES-NI, we
-# hard-code some instructions. Extensions past AES-NI on the other hand
-# are traced by examining assembler version in individual perlasm
-# modules...
-
-my %regrm = (	"%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3,
-		"%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7	);
-
-sub rex {
- my $opcode=shift;
- my ($dst,$src,$rex)=@_;
-
-   $rex|=0x04 if($dst>=8);
-   $rex|=0x01 if($src>=8);
-   push @$opcode,($rex|0x40) if ($rex);
-}
-
-my $movq = sub {	# elderly gas can't handle inter-register movq
-  my $arg = shift;
-  my @opcode=(0x66);
-    if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) {
-	my ($src,$dst)=($1,$2);
-	if ($dst !~ /[0-9]+/)	{ $dst = $regrm{"%e$dst"}; }
-	rex(\@opcode,$src,$dst,0x8);
-	push @opcode,0x0f,0x7e;
-	push @opcode,0xc0|(($src&7)<<3)|($dst&7);	# ModR/M
-	@opcode;
-    } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) {
-	my ($src,$dst)=($2,$1);
-	if ($dst !~ /[0-9]+/)	{ $dst = $regrm{"%e$dst"}; }
-	rex(\@opcode,$src,$dst,0x8);
-	push @opcode,0x0f,0x6e;
-	push @opcode,0xc0|(($src&7)<<3)|($dst&7);	# ModR/M
-	@opcode;
-    } else {
-	();
-    }
-};
-
-my $pextrd = sub {
-    if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) {
-      my @opcode=(0x66);
-	my $imm=$1;
-	my $src=$2;
-	my $dst=$3;
-	if ($dst =~ /%r([0-9]+)d/)	{ $dst = $1; }
-	elsif ($dst =~ /%e/)		{ $dst = $regrm{$dst}; }
-	rex(\@opcode,$src,$dst);
-	push @opcode,0x0f,0x3a,0x16;
-	push @opcode,0xc0|(($src&7)<<3)|($dst&7);	# ModR/M
-	push @opcode,$imm;
-	@opcode;
-    } else {
-	();
-    }
-};
-
-my $pinsrd = sub {
-    if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) {
-      my @opcode=(0x66);
-	my $imm=$1;
-	my $src=$2;
-	my $dst=$3;
-	if ($src =~ /%r([0-9]+)/)	{ $src = $1; }
-	elsif ($src =~ /%e/)		{ $src = $regrm{$src}; }
-	rex(\@opcode,$dst,$src);
-	push @opcode,0x0f,0x3a,0x22;
-	push @opcode,0xc0|(($dst&7)<<3)|($src&7);	# ModR/M
-	push @opcode,$imm;
-	@opcode;
-    } else {
-	();
-    }
-};
-
-my $pshufb = sub {
-    if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
-      my @opcode=(0x66);
-	rex(\@opcode,$2,$1);
-	push @opcode,0x0f,0x38,0x00;
-	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
-	@opcode;
-    } else {
-	();
-    }
-};
-
-my $palignr = sub {
-    if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
-      my @opcode=(0x66);
-	rex(\@opcode,$3,$2);
-	push @opcode,0x0f,0x3a,0x0f;
-	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
-	push @opcode,$1;
-	@opcode;
-    } else {
-	();
-    }
-};
-
-my $pclmulqdq = sub {
-    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
-      my @opcode=(0x66);
-	rex(\@opcode,$3,$2);
-	push @opcode,0x0f,0x3a,0x44;
-	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
-	my $c=$1;
-	push @opcode,$c=~/^0/?oct($c):$c;
-	@opcode;
-    } else {
-	();
-    }
-};
-
-my $rdrand = sub {
-    if (shift =~ /%[er](\w+)/) {
-      my @opcode=();
-      my $dst=$1;
-	if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
-	rex(\@opcode,0,$dst,8);
-	push @opcode,0x0f,0xc7,0xf0|($dst&7);
-	@opcode;
-    } else {
-	();
-    }
-};
-
-my $rdseed = sub {
-    if (shift =~ /%[er](\w+)/) {
-      my @opcode=();
-      my $dst=$1;
-	if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
-	rex(\@opcode,0,$dst,8);
-	push @opcode,0x0f,0xc7,0xf8|($dst&7);
-	@opcode;
-    } else {
-	();
-    }
-};
-
-# Not all AVX-capable assemblers recognize AMD XOP extension. Since we
-# are using only two instructions hand-code them in order to be excused
-# from chasing assembler versions...
-
-sub rxb {
- my $opcode=shift;
- my ($dst,$src1,$src2,$rxb)=@_;
-
-   $rxb|=0x7<<5;
-   $rxb&=~(0x04<<5) if($dst>=8);
-   $rxb&=~(0x01<<5) if($src1>=8);
-   $rxb&=~(0x02<<5) if($src2>=8);
-   push @$opcode,$rxb;
-}
-
-my $vprotd = sub {
-    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
-      my @opcode=(0x8f);
-	rxb(\@opcode,$3,$2,-1,0x08);
-	push @opcode,0x78,0xc2;
-	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
-	my $c=$1;
-	push @opcode,$c=~/^0/?oct($c):$c;
-	@opcode;
-    } else {
-	();
-    }
-};
-
-my $vprotq = sub {
-    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
-      my @opcode=(0x8f);
-	rxb(\@opcode,$3,$2,-1,0x08);
-	push @opcode,0x78,0xc3;
-	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
-	my $c=$1;
-	push @opcode,$c=~/^0/?oct($c):$c;
-	@opcode;
-    } else {
-	();
-    }
-};
-
 # Intel Control-flow Enforcement Technology extension. All functions and
 # indirect branch targets will have to start with this instruction...
 
diff --git a/gen/bcm/aesni-x86_64-apple.S b/gen/bcm/aesni-x86_64-apple.S
index 0247a2d..958cc5a 100644
--- a/gen/bcm/aesni-x86_64-apple.S
+++ b/gen/bcm/aesni-x86_64-apple.S
@@ -23,12 +23,12 @@
 	leaq	32(%rdx),%rdx
 	xorps	%xmm0,%xmm2
 L$oop_enc1_1:
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	decl	%eax
 	movups	(%rdx),%xmm1
 	leaq	16(%rdx),%rdx
 	jnz	L$oop_enc1_1
-.byte	102,15,56,221,209
+	aesenclast	%xmm1,%xmm2
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	movups	%xmm2,(%rsi)
@@ -51,12 +51,12 @@
 	leaq	32(%rdx),%rdx
 	xorps	%xmm0,%xmm2
 L$oop_dec1_2:
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	decl	%eax
 	movups	(%rdx),%xmm1
 	leaq	16(%rdx),%rdx
 	jnz	L$oop_dec1_2
-.byte	102,15,56,223,209
+	aesdeclast	%xmm1,%xmm2
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	movups	%xmm2,(%rsi)
@@ -79,19 +79,19 @@
 	addq	$16,%rax
 
 L$enc_loop2:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	L$enc_loop2
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenclast	%xmm0,%xmm2
+	aesenclast	%xmm0,%xmm3
 	ret
 
 
@@ -110,19 +110,19 @@
 	addq	$16,%rax
 
 L$dec_loop2:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	L$dec_loop2
 
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdeclast	%xmm0,%xmm2
+	aesdeclast	%xmm0,%xmm3
 	ret
 
 
@@ -142,23 +142,23 @@
 	addq	$16,%rax
 
 L$enc_loop3:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	L$enc_loop3
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenclast	%xmm0,%xmm2
+	aesenclast	%xmm0,%xmm3
+	aesenclast	%xmm0,%xmm4
 	ret
 
 
@@ -178,23 +178,23 @@
 	addq	$16,%rax
 
 L$dec_loop3:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	L$dec_loop3
 
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdeclast	%xmm0,%xmm2
+	aesdeclast	%xmm0,%xmm3
+	aesdeclast	%xmm0,%xmm4
 	ret
 
 
@@ -216,27 +216,27 @@
 	addq	$16,%rax
 
 L$enc_loop4:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	L$enc_loop4
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-.byte	102,15,56,221,232
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenclast	%xmm0,%xmm2
+	aesenclast	%xmm0,%xmm3
+	aesenclast	%xmm0,%xmm4
+	aesenclast	%xmm0,%xmm5
 	ret
 
 
@@ -258,27 +258,27 @@
 	addq	$16,%rax
 
 L$dec_loop4:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	L$dec_loop4
 
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-.byte	102,15,56,223,232
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdeclast	%xmm0,%xmm2
+	aesdeclast	%xmm0,%xmm3
+	aesdeclast	%xmm0,%xmm4
+	aesdeclast	%xmm0,%xmm5
 	ret
 
 
@@ -292,49 +292,49 @@
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
 	pxor	%xmm0,%xmm4
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	leaq	32(%rcx,%rax,1),%rcx
 	negq	%rax
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm3
 	pxor	%xmm0,%xmm5
 	pxor	%xmm0,%xmm6
-.byte	102,15,56,220,225
+	aesenc	%xmm1,%xmm4
 	pxor	%xmm0,%xmm7
 	movups	(%rcx,%rax,1),%xmm0
 	addq	$16,%rax
 	jmp	L$enc_loop6_enter
 .p2align	4
 L$enc_loop6:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
 L$enc_loop6_enter:
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	L$enc_loop6
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-.byte	102,15,56,221,232
-.byte	102,15,56,221,240
-.byte	102,15,56,221,248
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenclast	%xmm0,%xmm2
+	aesenclast	%xmm0,%xmm3
+	aesenclast	%xmm0,%xmm4
+	aesenclast	%xmm0,%xmm5
+	aesenclast	%xmm0,%xmm6
+	aesenclast	%xmm0,%xmm7
 	ret
 
 
@@ -348,49 +348,49 @@
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
 	pxor	%xmm0,%xmm4
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	leaq	32(%rcx,%rax,1),%rcx
 	negq	%rax
-.byte	102,15,56,222,217
+	aesdec	%xmm1,%xmm3
 	pxor	%xmm0,%xmm5
 	pxor	%xmm0,%xmm6
-.byte	102,15,56,222,225
+	aesdec	%xmm1,%xmm4
 	pxor	%xmm0,%xmm7
 	movups	(%rcx,%rax,1),%xmm0
 	addq	$16,%rax
 	jmp	L$dec_loop6_enter
 .p2align	4
 L$dec_loop6:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
 L$dec_loop6_enter:
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	L$dec_loop6
 
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-.byte	102,15,56,223,232
-.byte	102,15,56,223,240
-.byte	102,15,56,223,248
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdeclast	%xmm0,%xmm2
+	aesdeclast	%xmm0,%xmm3
+	aesdeclast	%xmm0,%xmm4
+	aesdeclast	%xmm0,%xmm5
+	aesdeclast	%xmm0,%xmm6
+	aesdeclast	%xmm0,%xmm7
 	ret
 
 
@@ -408,55 +408,55 @@
 	pxor	%xmm0,%xmm6
 	leaq	32(%rcx,%rax,1),%rcx
 	negq	%rax
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	pxor	%xmm0,%xmm7
 	pxor	%xmm0,%xmm8
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm3
 	pxor	%xmm0,%xmm9
 	movups	(%rcx,%rax,1),%xmm0
 	addq	$16,%rax
 	jmp	L$enc_loop8_inner
 .p2align	4
 L$enc_loop8:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
 L$enc_loop8_inner:
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 L$enc_loop8_enter:
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	L$enc_loop8
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-.byte	102,15,56,221,232
-.byte	102,15,56,221,240
-.byte	102,15,56,221,248
-.byte	102,68,15,56,221,192
-.byte	102,68,15,56,221,200
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
+	aesenclast	%xmm0,%xmm2
+	aesenclast	%xmm0,%xmm3
+	aesenclast	%xmm0,%xmm4
+	aesenclast	%xmm0,%xmm5
+	aesenclast	%xmm0,%xmm6
+	aesenclast	%xmm0,%xmm7
+	aesenclast	%xmm0,%xmm8
+	aesenclast	%xmm0,%xmm9
 	ret
 
 
@@ -474,55 +474,55 @@
 	pxor	%xmm0,%xmm6
 	leaq	32(%rcx,%rax,1),%rcx
 	negq	%rax
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	pxor	%xmm0,%xmm7
 	pxor	%xmm0,%xmm8
-.byte	102,15,56,222,217
+	aesdec	%xmm1,%xmm3
 	pxor	%xmm0,%xmm9
 	movups	(%rcx,%rax,1),%xmm0
 	addq	$16,%rax
 	jmp	L$dec_loop8_inner
 .p2align	4
 L$dec_loop8:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
 L$dec_loop8_inner:
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 L$dec_loop8_enter:
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	L$dec_loop8
 
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-.byte	102,15,56,223,232
-.byte	102,15,56,223,240
-.byte	102,15,56,223,248
-.byte	102,68,15,56,223,192
-.byte	102,68,15,56,223,200
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
+	aesdeclast	%xmm0,%xmm2
+	aesdeclast	%xmm0,%xmm3
+	aesdeclast	%xmm0,%xmm4
+	aesdeclast	%xmm0,%xmm5
+	aesdeclast	%xmm0,%xmm6
+	aesdeclast	%xmm0,%xmm7
+	aesdeclast	%xmm0,%xmm8
+	aesdeclast	%xmm0,%xmm9
 	ret
 
 
@@ -634,12 +634,12 @@
 	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 L$oop_enc1_3:
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
 	jnz	L$oop_enc1_3
-.byte	102,15,56,221,209
+	aesenclast	%xmm1,%xmm2
 	movups	%xmm2,(%rsi)
 	jmp	L$ecb_ret
 .p2align	4
@@ -795,12 +795,12 @@
 	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 L$oop_dec1_4:
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
 	jnz	L$oop_dec1_4
-.byte	102,15,56,223,209
+	aesdeclast	%xmm1,%xmm2
 	movups	%xmm2,(%rsi)
 	pxor	%xmm2,%xmm2
 	jmp	L$ecb_ret
@@ -895,12 +895,12 @@
 	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 L$oop_enc1_5:
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	decl	%edx
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
 	jnz	L$oop_enc1_5
-.byte	102,15,56,221,209
+	aesenclast	%xmm1,%xmm2
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	xorps	%xmm3,%xmm2
@@ -943,17 +943,17 @@
 	bswapl	%edx
 	xorl	%ebp,%eax
 	xorl	%ebp,%edx
-.byte	102,15,58,34,216,3
+	pinsrd	$3,%eax,%xmm3
 	leaq	3(%r8),%rax
 	movdqa	%xmm3,16(%rsp)
-.byte	102,15,58,34,226,3
+	pinsrd	$3,%edx,%xmm4
 	bswapl	%eax
 	movq	%r10,%rdx
 	leaq	4(%r8),%r10
 	movdqa	%xmm4,32(%rsp)
 	xorl	%ebp,%eax
 	bswapl	%r10d
-.byte	102,15,58,34,232,3
+	pinsrd	$3,%eax,%xmm5
 	xorl	%ebp,%r10d
 	movdqa	%xmm5,48(%rsp)
 	leaq	5(%r8),%r9
@@ -987,163 +987,163 @@
 L$ctr32_loop8:
 	addl	$8,%r8d
 	movdqa	96(%rsp),%xmm8
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	movl	%r8d,%r9d
 	movdqa	112(%rsp),%xmm9
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm3
 	bswapl	%r9d
 	movups	32-128(%rcx),%xmm0
-.byte	102,15,56,220,225
+	aesenc	%xmm1,%xmm4
 	xorl	%ebp,%r9d
 	nop
-.byte	102,15,56,220,233
+	aesenc	%xmm1,%xmm5
 	movl	%r9d,0+12(%rsp)
 	leaq	1(%r8),%r9
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movups	48-128(%rcx),%xmm1
 	bswapl	%r9d
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
 	xorl	%ebp,%r9d
 .byte	0x66,0x90
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
 	movl	%r9d,16+12(%rsp)
 	leaq	2(%r8),%r9
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	64-128(%rcx),%xmm0
 	bswapl	%r9d
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
 	xorl	%ebp,%r9d
 .byte	0x66,0x90
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
 	movl	%r9d,32+12(%rsp)
 	leaq	3(%r8),%r9
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movups	80-128(%rcx),%xmm1
 	bswapl	%r9d
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
 	xorl	%ebp,%r9d
 .byte	0x66,0x90
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
 	movl	%r9d,48+12(%rsp)
 	leaq	4(%r8),%r9
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	96-128(%rcx),%xmm0
 	bswapl	%r9d
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
 	xorl	%ebp,%r9d
 .byte	0x66,0x90
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
 	movl	%r9d,64+12(%rsp)
 	leaq	5(%r8),%r9
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movups	112-128(%rcx),%xmm1
 	bswapl	%r9d
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
 	xorl	%ebp,%r9d
 .byte	0x66,0x90
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
 	movl	%r9d,80+12(%rsp)
 	leaq	6(%r8),%r9
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	128-128(%rcx),%xmm0
 	bswapl	%r9d
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
 	xorl	%ebp,%r9d
 .byte	0x66,0x90
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
 	movl	%r9d,96+12(%rsp)
 	leaq	7(%r8),%r9
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movups	144-128(%rcx),%xmm1
 	bswapl	%r9d
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
 	xorl	%ebp,%r9d
 	movdqu	0(%rdi),%xmm10
-.byte	102,15,56,220,232
+	aesenc	%xmm0,%xmm5
 	movl	%r9d,112+12(%rsp)
 	cmpl	$11,%eax
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	160-128(%rcx),%xmm0
 
 	jb	L$ctr32_enc_done
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movups	176-128(%rcx),%xmm1
 
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	192-128(%rcx),%xmm0
 	je	L$ctr32_enc_done
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movups	208-128(%rcx),%xmm1
 
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	224-128(%rcx),%xmm0
 	jmp	L$ctr32_enc_done
 
@@ -1162,35 +1162,35 @@
 	prefetcht0	448(%rdi)
 	prefetcht0	512(%rdi)
 	pxor	%xmm0,%xmm15
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movdqu	96(%rdi),%xmm1
 	leaq	128(%rdi),%rdi
 
-.byte	102,65,15,56,221,210
+	aesenclast	%xmm10,%xmm2
 	pxor	%xmm0,%xmm1
 	movdqu	112-128(%rdi),%xmm10
-.byte	102,65,15,56,221,219
+	aesenclast	%xmm11,%xmm3
 	pxor	%xmm0,%xmm10
 	movdqa	0(%rsp),%xmm11
-.byte	102,65,15,56,221,228
-.byte	102,65,15,56,221,237
+	aesenclast	%xmm12,%xmm4
+	aesenclast	%xmm13,%xmm5
 	movdqa	16(%rsp),%xmm12
 	movdqa	32(%rsp),%xmm13
-.byte	102,65,15,56,221,246
-.byte	102,65,15,56,221,255
+	aesenclast	%xmm14,%xmm6
+	aesenclast	%xmm15,%xmm7
 	movdqa	48(%rsp),%xmm14
 	movdqa	64(%rsp),%xmm15
-.byte	102,68,15,56,221,193
+	aesenclast	%xmm1,%xmm8
 	movdqa	80(%rsp),%xmm0
 	movups	16-128(%rcx),%xmm1
-.byte	102,69,15,56,221,202
+	aesenclast	%xmm10,%xmm9
 
 	movups	%xmm2,(%rsi)
 	movdqa	%xmm11,%xmm2
@@ -1229,19 +1229,19 @@
 	pxor	%xmm9,%xmm9
 
 	movups	16(%rcx),%xmm0
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
 	leaq	32-16(%rcx,%rax,1),%rcx
 	negq	%rax
-.byte	102,15,56,220,225
+	aesenc	%xmm1,%xmm4
 	addq	$16,%rax
 	movups	(%rdi),%xmm10
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
 	movups	16(%rdi),%xmm11
 	movups	32(%rdi),%xmm12
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
 
 	call	L$enc_loop8_enter
 
@@ -1272,20 +1272,20 @@
 
 .p2align	5
 L$ctr32_loop4:
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	leaq	16(%rcx),%rcx
 	decl	%eax
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
 	movups	(%rcx),%xmm1
 	jnz	L$ctr32_loop4
-.byte	102,15,56,221,209
-.byte	102,15,56,221,217
+	aesenclast	%xmm1,%xmm2
+	aesenclast	%xmm1,%xmm3
 	movups	(%rdi),%xmm10
 	movups	16(%rdi),%xmm11
-.byte	102,15,56,221,225
-.byte	102,15,56,221,233
+	aesenclast	%xmm1,%xmm4
+	aesenclast	%xmm1,%xmm5
 	movups	32(%rdi),%xmm12
 	movups	48(%rdi),%xmm13
 
@@ -1301,16 +1301,16 @@
 
 .p2align	5
 L$ctr32_loop3:
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	leaq	16(%rcx),%rcx
 	decl	%eax
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
 	movups	(%rcx),%xmm1
 	jnz	L$ctr32_loop3
-.byte	102,15,56,221,209
-.byte	102,15,56,221,217
-.byte	102,15,56,221,225
+	aesenclast	%xmm1,%xmm2
+	aesenclast	%xmm1,%xmm3
+	aesenclast	%xmm1,%xmm4
 
 	movups	(%rdi),%xmm10
 	xorps	%xmm10,%xmm2
@@ -1393,12 +1393,12 @@
 	leaq	32(%rcx),%rcx
 	xorps	%xmm3,%xmm2
 L$oop_enc1_6:
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
 	jnz	L$oop_enc1_6
-.byte	102,15,56,221,209
+	aesenclast	%xmm1,%xmm2
 	movl	%r10d,%eax
 	movq	%r11,%rcx
 	movups	%xmm2,0(%rsi)
@@ -1444,12 +1444,12 @@
 	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 L$oop_dec1_7:
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	decl	%r10d
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
 	jnz	L$oop_dec1_7
-.byte	102,15,56,223,209
+	aesdeclast	%xmm1,%xmm2
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	movdqu	%xmm4,(%r8)
@@ -1508,166 +1508,166 @@
 	pxor	%xmm0,%xmm7
 	pxor	%xmm0,%xmm8
 
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	pxor	%xmm0,%xmm9
 	movups	32-112(%rcx),%xmm0
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
 	adcq	$0,%rbp
 	andq	$128,%rbp
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm9
 	addq	%rdi,%rbp
 	movups	48-112(%rcx),%xmm1
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	64-112(%rcx),%xmm0
 	nop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 	movups	80-112(%rcx),%xmm1
 	nop
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	96-112(%rcx),%xmm0
 	nop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 	movups	112-112(%rcx),%xmm1
 	nop
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	128-112(%rcx),%xmm0
 	nop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 	movups	144-112(%rcx),%xmm1
 	cmpl	$11,%eax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	160-112(%rcx),%xmm0
 	jb	L$cbc_dec_done
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 	movups	176-112(%rcx),%xmm1
 	nop
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	192-112(%rcx),%xmm0
 	je	L$cbc_dec_done
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 	movups	208-112(%rcx),%xmm1
 	nop
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	224-112(%rcx),%xmm0
 	jmp	L$cbc_dec_done
 .p2align	4
 L$cbc_dec_done:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
 	pxor	%xmm0,%xmm10
 	pxor	%xmm0,%xmm11
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
 	pxor	%xmm0,%xmm12
 	pxor	%xmm0,%xmm13
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
 	pxor	%xmm0,%xmm14
 	pxor	%xmm0,%xmm15
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 	movdqu	80(%rdi),%xmm1
 
-.byte	102,65,15,56,223,210
+	aesdeclast	%xmm10,%xmm2
 	movdqu	96(%rdi),%xmm10
 	pxor	%xmm0,%xmm1
-.byte	102,65,15,56,223,219
+	aesdeclast	%xmm11,%xmm3
 	pxor	%xmm0,%xmm10
 	movdqu	112(%rdi),%xmm0
-.byte	102,65,15,56,223,228
+	aesdeclast	%xmm12,%xmm4
 	leaq	128(%rdi),%rdi
 	movdqu	0(%rbp),%xmm11
-.byte	102,65,15,56,223,237
-.byte	102,65,15,56,223,246
+	aesdeclast	%xmm13,%xmm5
+	aesdeclast	%xmm14,%xmm6
 	movdqu	16(%rbp),%xmm12
 	movdqu	32(%rbp),%xmm13
-.byte	102,65,15,56,223,255
-.byte	102,68,15,56,223,193
+	aesdeclast	%xmm15,%xmm7
+	aesdeclast	%xmm1,%xmm8
 	movdqu	48(%rbp),%xmm14
 	movdqu	64(%rbp),%xmm15
-.byte	102,69,15,56,223,202
+	aesdeclast	%xmm10,%xmm9
 	movdqa	%xmm0,%xmm10
 	movdqu	80(%rbp),%xmm1
 	movups	-112(%rcx),%xmm0
@@ -1811,12 +1811,12 @@
 	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 L$oop_dec1_8:
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
 	jnz	L$oop_dec1_8
-.byte	102,15,56,223,209
+	aesdeclast	%xmm1,%xmm2
 	xorps	%xmm10,%xmm2
 	movaps	%xmm11,%xmm10
 	jmp	L$cbc_dec_tail_collected
@@ -1927,8 +1927,8 @@
 L$dec_key_inverse:
 	movups	(%rdi),%xmm0
 	movups	(%rdx),%xmm1
-.byte	102,15,56,219,192
-.byte	102,15,56,219,201
+	aesimc	%xmm0,%xmm0
+	aesimc	%xmm1,%xmm1
 	leaq	16(%rdi),%rdi
 	leaq	-16(%rdx),%rdx
 	movups	%xmm0,16(%rdx)
@@ -1937,7 +1937,7 @@
 	ja	L$dec_key_inverse
 
 	movups	(%rdi),%xmm0
-.byte	102,15,56,219,192
+	aesimc	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	movups	%xmm0,(%rdx)
 	pxor	%xmm0,%xmm0
@@ -1973,25 +1973,25 @@
 	movl	$9,%esi
 
 	movups	%xmm0,(%rdx)
-.byte	102,15,58,223,200,1
+	aeskeygenassist	$0x1,%xmm0,%xmm1
 	call	L$key_expansion_128_cold
-.byte	102,15,58,223,200,2
+	aeskeygenassist	$0x2,%xmm0,%xmm1
 	call	L$key_expansion_128
-.byte	102,15,58,223,200,4
+	aeskeygenassist	$0x4,%xmm0,%xmm1
 	call	L$key_expansion_128
-.byte	102,15,58,223,200,8
+	aeskeygenassist	$0x8,%xmm0,%xmm1
 	call	L$key_expansion_128
-.byte	102,15,58,223,200,16
+	aeskeygenassist	$0x10,%xmm0,%xmm1
 	call	L$key_expansion_128
-.byte	102,15,58,223,200,32
+	aeskeygenassist	$0x20,%xmm0,%xmm1
 	call	L$key_expansion_128
-.byte	102,15,58,223,200,64
+	aeskeygenassist	$0x40,%xmm0,%xmm1
 	call	L$key_expansion_128
-.byte	102,15,58,223,200,128
+	aeskeygenassist	$0x80,%xmm0,%xmm1
 	call	L$key_expansion_128
-.byte	102,15,58,223,200,27
+	aeskeygenassist	$0x1b,%xmm0,%xmm1
 	call	L$key_expansion_128
-.byte	102,15,58,223,200,54
+	aeskeygenassist	$0x36,%xmm0,%xmm1
 	call	L$key_expansion_128
 	movups	%xmm0,(%rax)
 	movl	%esi,80(%rax)
@@ -2004,21 +2004,21 @@
 	movl	$11,%esi
 
 	movups	%xmm0,(%rdx)
-.byte	102,15,58,223,202,1
+	aeskeygenassist	$0x1,%xmm2,%xmm1
 	call	L$key_expansion_192a_cold
-.byte	102,15,58,223,202,2
+	aeskeygenassist	$0x2,%xmm2,%xmm1
 	call	L$key_expansion_192b
-.byte	102,15,58,223,202,4
+	aeskeygenassist	$0x4,%xmm2,%xmm1
 	call	L$key_expansion_192a
-.byte	102,15,58,223,202,8
+	aeskeygenassist	$0x8,%xmm2,%xmm1
 	call	L$key_expansion_192b
-.byte	102,15,58,223,202,16
+	aeskeygenassist	$0x10,%xmm2,%xmm1
 	call	L$key_expansion_192a
-.byte	102,15,58,223,202,32
+	aeskeygenassist	$0x20,%xmm2,%xmm1
 	call	L$key_expansion_192b
-.byte	102,15,58,223,202,64
+	aeskeygenassist	$0x40,%xmm2,%xmm1
 	call	L$key_expansion_192a
-.byte	102,15,58,223,202,128
+	aeskeygenassist	$0x80,%xmm2,%xmm1
 	call	L$key_expansion_192b
 	movups	%xmm0,(%rax)
 	movl	%esi,48(%rax)
@@ -2033,31 +2033,31 @@
 
 	movups	%xmm0,(%rdx)
 	movups	%xmm2,16(%rdx)
-.byte	102,15,58,223,202,1
+	aeskeygenassist	$0x1,%xmm2,%xmm1
 	call	L$key_expansion_256a_cold
-.byte	102,15,58,223,200,1
+	aeskeygenassist	$0x1,%xmm0,%xmm1
 	call	L$key_expansion_256b
-.byte	102,15,58,223,202,2
+	aeskeygenassist	$0x2,%xmm2,%xmm1
 	call	L$key_expansion_256a
-.byte	102,15,58,223,200,2
+	aeskeygenassist	$0x2,%xmm0,%xmm1
 	call	L$key_expansion_256b
-.byte	102,15,58,223,202,4
+	aeskeygenassist	$0x4,%xmm2,%xmm1
 	call	L$key_expansion_256a
-.byte	102,15,58,223,200,4
+	aeskeygenassist	$0x4,%xmm0,%xmm1
 	call	L$key_expansion_256b
-.byte	102,15,58,223,202,8
+	aeskeygenassist	$0x8,%xmm2,%xmm1
 	call	L$key_expansion_256a
-.byte	102,15,58,223,200,8
+	aeskeygenassist	$0x8,%xmm0,%xmm1
 	call	L$key_expansion_256b
-.byte	102,15,58,223,202,16
+	aeskeygenassist	$0x10,%xmm2,%xmm1
 	call	L$key_expansion_256a
-.byte	102,15,58,223,200,16
+	aeskeygenassist	$0x10,%xmm0,%xmm1
 	call	L$key_expansion_256b
-.byte	102,15,58,223,202,32
+	aeskeygenassist	$0x20,%xmm2,%xmm1
 	call	L$key_expansion_256a
-.byte	102,15,58,223,200,32
+	aeskeygenassist	$0x20,%xmm0,%xmm1
 	call	L$key_expansion_256b
-.byte	102,15,58,223,202,64
+	aeskeygenassist	$0x40,%xmm2,%xmm1
 	call	L$key_expansion_256a
 	movups	%xmm0,(%rax)
 	movl	%esi,16(%rax)
@@ -2195,8 +2195,8 @@
 
 .p2align	4
 L$oop_key128:
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
+	pshufb	%xmm5,%xmm0
+	aesenclast	%xmm4,%xmm0
 	pslld	$1,%xmm4
 	leaq	16(%rax),%rax
 
@@ -2217,8 +2217,8 @@
 
 	movdqa	L$key_rcon1b(%rip),%xmm4
 
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
+	pshufb	%xmm5,%xmm0
+	aesenclast	%xmm4,%xmm0
 	pslld	$1,%xmm4
 
 	movdqa	%xmm2,%xmm3
@@ -2233,8 +2233,8 @@
 	movdqu	%xmm0,(%rax)
 
 	movdqa	%xmm0,%xmm2
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
+	pshufb	%xmm5,%xmm0
+	aesenclast	%xmm4,%xmm0
 
 	movdqa	%xmm2,%xmm3
 	pslldq	$4,%xmm2
@@ -2265,8 +2265,8 @@
 L$oop_key192:
 	movq	%xmm2,0(%rax)
 	movdqa	%xmm2,%xmm1
-.byte	102,15,56,0,213
-.byte	102,15,56,221,212
+	pshufb	%xmm5,%xmm2
+	aesenclast	%xmm4,%xmm2
 	pslld	$1,%xmm4
 	leaq	24(%rax),%rax
 
@@ -2309,8 +2309,8 @@
 
 .p2align	4
 L$oop_key256:
-.byte	102,15,56,0,213
-.byte	102,15,56,221,212
+	pshufb	%xmm5,%xmm2
+	aesenclast	%xmm4,%xmm2
 
 	movdqa	%xmm0,%xmm3
 	pslldq	$4,%xmm0
@@ -2329,7 +2329,7 @@
 
 	pshufd	$0xff,%xmm0,%xmm2
 	pxor	%xmm3,%xmm3
-.byte	102,15,56,221,211
+	aesenclast	%xmm3,%xmm2
 
 	movdqa	%xmm1,%xmm3
 	pslldq	$4,%xmm1
diff --git a/gen/bcm/aesni-x86_64-linux.S b/gen/bcm/aesni-x86_64-linux.S
index bedd98b..4bce582 100644
--- a/gen/bcm/aesni-x86_64-linux.S
+++ b/gen/bcm/aesni-x86_64-linux.S
@@ -24,12 +24,12 @@
 	leaq	32(%rdx),%rdx
 	xorps	%xmm0,%xmm2
 .Loop_enc1_1:
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	decl	%eax
 	movups	(%rdx),%xmm1
 	leaq	16(%rdx),%rdx
 	jnz	.Loop_enc1_1
-.byte	102,15,56,221,209
+	aesenclast	%xmm1,%xmm2
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	movups	%xmm2,(%rsi)
@@ -52,12 +52,12 @@
 	leaq	32(%rdx),%rdx
 	xorps	%xmm0,%xmm2
 .Loop_dec1_2:
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	decl	%eax
 	movups	(%rdx),%xmm1
 	leaq	16(%rdx),%rdx
 	jnz	.Loop_dec1_2
-.byte	102,15,56,223,209
+	aesdeclast	%xmm1,%xmm2
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	movups	%xmm2,(%rsi)
@@ -80,19 +80,19 @@
 	addq	$16,%rax
 
 .Lenc_loop2:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Lenc_loop2
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenclast	%xmm0,%xmm2
+	aesenclast	%xmm0,%xmm3
 	ret
 .cfi_endproc	
 .size	_aesni_encrypt2,.-_aesni_encrypt2
@@ -111,19 +111,19 @@
 	addq	$16,%rax
 
 .Ldec_loop2:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Ldec_loop2
 
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdeclast	%xmm0,%xmm2
+	aesdeclast	%xmm0,%xmm3
 	ret
 .cfi_endproc	
 .size	_aesni_decrypt2,.-_aesni_decrypt2
@@ -143,23 +143,23 @@
 	addq	$16,%rax
 
 .Lenc_loop3:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Lenc_loop3
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenclast	%xmm0,%xmm2
+	aesenclast	%xmm0,%xmm3
+	aesenclast	%xmm0,%xmm4
 	ret
 .cfi_endproc	
 .size	_aesni_encrypt3,.-_aesni_encrypt3
@@ -179,23 +179,23 @@
 	addq	$16,%rax
 
 .Ldec_loop3:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Ldec_loop3
 
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdeclast	%xmm0,%xmm2
+	aesdeclast	%xmm0,%xmm3
+	aesdeclast	%xmm0,%xmm4
 	ret
 .cfi_endproc	
 .size	_aesni_decrypt3,.-_aesni_decrypt3
@@ -217,27 +217,27 @@
 	addq	$16,%rax
 
 .Lenc_loop4:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Lenc_loop4
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-.byte	102,15,56,221,232
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenclast	%xmm0,%xmm2
+	aesenclast	%xmm0,%xmm3
+	aesenclast	%xmm0,%xmm4
+	aesenclast	%xmm0,%xmm5
 	ret
 .cfi_endproc	
 .size	_aesni_encrypt4,.-_aesni_encrypt4
@@ -259,27 +259,27 @@
 	addq	$16,%rax
 
 .Ldec_loop4:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Ldec_loop4
 
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-.byte	102,15,56,223,232
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdeclast	%xmm0,%xmm2
+	aesdeclast	%xmm0,%xmm3
+	aesdeclast	%xmm0,%xmm4
+	aesdeclast	%xmm0,%xmm5
 	ret
 .cfi_endproc	
 .size	_aesni_decrypt4,.-_aesni_decrypt4
@@ -293,49 +293,49 @@
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
 	pxor	%xmm0,%xmm4
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	leaq	32(%rcx,%rax,1),%rcx
 	negq	%rax
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm3
 	pxor	%xmm0,%xmm5
 	pxor	%xmm0,%xmm6
-.byte	102,15,56,220,225
+	aesenc	%xmm1,%xmm4
 	pxor	%xmm0,%xmm7
 	movups	(%rcx,%rax,1),%xmm0
 	addq	$16,%rax
 	jmp	.Lenc_loop6_enter
 .align	16
 .Lenc_loop6:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
 .Lenc_loop6_enter:
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Lenc_loop6
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-.byte	102,15,56,221,232
-.byte	102,15,56,221,240
-.byte	102,15,56,221,248
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenclast	%xmm0,%xmm2
+	aesenclast	%xmm0,%xmm3
+	aesenclast	%xmm0,%xmm4
+	aesenclast	%xmm0,%xmm5
+	aesenclast	%xmm0,%xmm6
+	aesenclast	%xmm0,%xmm7
 	ret
 .cfi_endproc	
 .size	_aesni_encrypt6,.-_aesni_encrypt6
@@ -349,49 +349,49 @@
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
 	pxor	%xmm0,%xmm4
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	leaq	32(%rcx,%rax,1),%rcx
 	negq	%rax
-.byte	102,15,56,222,217
+	aesdec	%xmm1,%xmm3
 	pxor	%xmm0,%xmm5
 	pxor	%xmm0,%xmm6
-.byte	102,15,56,222,225
+	aesdec	%xmm1,%xmm4
 	pxor	%xmm0,%xmm7
 	movups	(%rcx,%rax,1),%xmm0
 	addq	$16,%rax
 	jmp	.Ldec_loop6_enter
 .align	16
 .Ldec_loop6:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
 .Ldec_loop6_enter:
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Ldec_loop6
 
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-.byte	102,15,56,223,232
-.byte	102,15,56,223,240
-.byte	102,15,56,223,248
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdeclast	%xmm0,%xmm2
+	aesdeclast	%xmm0,%xmm3
+	aesdeclast	%xmm0,%xmm4
+	aesdeclast	%xmm0,%xmm5
+	aesdeclast	%xmm0,%xmm6
+	aesdeclast	%xmm0,%xmm7
 	ret
 .cfi_endproc	
 .size	_aesni_decrypt6,.-_aesni_decrypt6
@@ -409,55 +409,55 @@
 	pxor	%xmm0,%xmm6
 	leaq	32(%rcx,%rax,1),%rcx
 	negq	%rax
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	pxor	%xmm0,%xmm7
 	pxor	%xmm0,%xmm8
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm3
 	pxor	%xmm0,%xmm9
 	movups	(%rcx,%rax,1),%xmm0
 	addq	$16,%rax
 	jmp	.Lenc_loop8_inner
 .align	16
 .Lenc_loop8:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
 .Lenc_loop8_inner:
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 .Lenc_loop8_enter:
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Lenc_loop8
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-.byte	102,15,56,221,232
-.byte	102,15,56,221,240
-.byte	102,15,56,221,248
-.byte	102,68,15,56,221,192
-.byte	102,68,15,56,221,200
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
+	aesenclast	%xmm0,%xmm2
+	aesenclast	%xmm0,%xmm3
+	aesenclast	%xmm0,%xmm4
+	aesenclast	%xmm0,%xmm5
+	aesenclast	%xmm0,%xmm6
+	aesenclast	%xmm0,%xmm7
+	aesenclast	%xmm0,%xmm8
+	aesenclast	%xmm0,%xmm9
 	ret
 .cfi_endproc	
 .size	_aesni_encrypt8,.-_aesni_encrypt8
@@ -475,55 +475,55 @@
 	pxor	%xmm0,%xmm6
 	leaq	32(%rcx,%rax,1),%rcx
 	negq	%rax
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	pxor	%xmm0,%xmm7
 	pxor	%xmm0,%xmm8
-.byte	102,15,56,222,217
+	aesdec	%xmm1,%xmm3
 	pxor	%xmm0,%xmm9
 	movups	(%rcx,%rax,1),%xmm0
 	addq	$16,%rax
 	jmp	.Ldec_loop8_inner
 .align	16
 .Ldec_loop8:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
 .Ldec_loop8_inner:
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 .Ldec_loop8_enter:
 	movups	(%rcx,%rax,1),%xmm1
 	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Ldec_loop8
 
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-.byte	102,15,56,223,232
-.byte	102,15,56,223,240
-.byte	102,15,56,223,248
-.byte	102,68,15,56,223,192
-.byte	102,68,15,56,223,200
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
+	aesdeclast	%xmm0,%xmm2
+	aesdeclast	%xmm0,%xmm3
+	aesdeclast	%xmm0,%xmm4
+	aesdeclast	%xmm0,%xmm5
+	aesdeclast	%xmm0,%xmm6
+	aesdeclast	%xmm0,%xmm7
+	aesdeclast	%xmm0,%xmm8
+	aesdeclast	%xmm0,%xmm9
 	ret
 .cfi_endproc	
 .size	_aesni_decrypt8,.-_aesni_decrypt8
@@ -635,12 +635,12 @@
 	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 .Loop_enc1_3:
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
 	jnz	.Loop_enc1_3
-.byte	102,15,56,221,209
+	aesenclast	%xmm1,%xmm2
 	movups	%xmm2,(%rsi)
 	jmp	.Lecb_ret
 .align	16
@@ -796,12 +796,12 @@
 	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 .Loop_dec1_4:
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
 	jnz	.Loop_dec1_4
-.byte	102,15,56,223,209
+	aesdeclast	%xmm1,%xmm2
 	movups	%xmm2,(%rsi)
 	pxor	%xmm2,%xmm2
 	jmp	.Lecb_ret
@@ -896,12 +896,12 @@
 	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 .Loop_enc1_5:
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	decl	%edx
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
 	jnz	.Loop_enc1_5
-.byte	102,15,56,221,209
+	aesenclast	%xmm1,%xmm2
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	xorps	%xmm3,%xmm2
@@ -944,17 +944,17 @@
 	bswapl	%edx
 	xorl	%ebp,%eax
 	xorl	%ebp,%edx
-.byte	102,15,58,34,216,3
+	pinsrd	$3,%eax,%xmm3
 	leaq	3(%r8),%rax
 	movdqa	%xmm3,16(%rsp)
-.byte	102,15,58,34,226,3
+	pinsrd	$3,%edx,%xmm4
 	bswapl	%eax
 	movq	%r10,%rdx
 	leaq	4(%r8),%r10
 	movdqa	%xmm4,32(%rsp)
 	xorl	%ebp,%eax
 	bswapl	%r10d
-.byte	102,15,58,34,232,3
+	pinsrd	$3,%eax,%xmm5
 	xorl	%ebp,%r10d
 	movdqa	%xmm5,48(%rsp)
 	leaq	5(%r8),%r9
@@ -988,163 +988,163 @@
 .Lctr32_loop8:
 	addl	$8,%r8d
 	movdqa	96(%rsp),%xmm8
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	movl	%r8d,%r9d
 	movdqa	112(%rsp),%xmm9
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm3
 	bswapl	%r9d
 	movups	32-128(%rcx),%xmm0
-.byte	102,15,56,220,225
+	aesenc	%xmm1,%xmm4
 	xorl	%ebp,%r9d
 	nop
-.byte	102,15,56,220,233
+	aesenc	%xmm1,%xmm5
 	movl	%r9d,0+12(%rsp)
 	leaq	1(%r8),%r9
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movups	48-128(%rcx),%xmm1
 	bswapl	%r9d
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
 	xorl	%ebp,%r9d
 .byte	0x66,0x90
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
 	movl	%r9d,16+12(%rsp)
 	leaq	2(%r8),%r9
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	64-128(%rcx),%xmm0
 	bswapl	%r9d
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
 	xorl	%ebp,%r9d
 .byte	0x66,0x90
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
 	movl	%r9d,32+12(%rsp)
 	leaq	3(%r8),%r9
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movups	80-128(%rcx),%xmm1
 	bswapl	%r9d
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
 	xorl	%ebp,%r9d
 .byte	0x66,0x90
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
 	movl	%r9d,48+12(%rsp)
 	leaq	4(%r8),%r9
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	96-128(%rcx),%xmm0
 	bswapl	%r9d
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
 	xorl	%ebp,%r9d
 .byte	0x66,0x90
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
 	movl	%r9d,64+12(%rsp)
 	leaq	5(%r8),%r9
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movups	112-128(%rcx),%xmm1
 	bswapl	%r9d
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
 	xorl	%ebp,%r9d
 .byte	0x66,0x90
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
 	movl	%r9d,80+12(%rsp)
 	leaq	6(%r8),%r9
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	128-128(%rcx),%xmm0
 	bswapl	%r9d
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
 	xorl	%ebp,%r9d
 .byte	0x66,0x90
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
 	movl	%r9d,96+12(%rsp)
 	leaq	7(%r8),%r9
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movups	144-128(%rcx),%xmm1
 	bswapl	%r9d
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
 	xorl	%ebp,%r9d
 	movdqu	0(%rdi),%xmm10
-.byte	102,15,56,220,232
+	aesenc	%xmm0,%xmm5
 	movl	%r9d,112+12(%rsp)
 	cmpl	$11,%eax
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	160-128(%rcx),%xmm0
 
 	jb	.Lctr32_enc_done
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movups	176-128(%rcx),%xmm1
 
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	192-128(%rcx),%xmm0
 	je	.Lctr32_enc_done
 
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movups	208-128(%rcx),%xmm1
 
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
+	aesenc	%xmm0,%xmm2
+	aesenc	%xmm0,%xmm3
+	aesenc	%xmm0,%xmm4
+	aesenc	%xmm0,%xmm5
+	aesenc	%xmm0,%xmm6
+	aesenc	%xmm0,%xmm7
+	aesenc	%xmm0,%xmm8
+	aesenc	%xmm0,%xmm9
 	movups	224-128(%rcx),%xmm0
 	jmp	.Lctr32_enc_done
 
@@ -1163,35 +1163,35 @@
 	prefetcht0	448(%rdi)
 	prefetcht0	512(%rdi)
 	pxor	%xmm0,%xmm15
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
+	aesenc	%xmm1,%xmm9
 	movdqu	96(%rdi),%xmm1
 	leaq	128(%rdi),%rdi
 
-.byte	102,65,15,56,221,210
+	aesenclast	%xmm10,%xmm2
 	pxor	%xmm0,%xmm1
 	movdqu	112-128(%rdi),%xmm10
-.byte	102,65,15,56,221,219
+	aesenclast	%xmm11,%xmm3
 	pxor	%xmm0,%xmm10
 	movdqa	0(%rsp),%xmm11
-.byte	102,65,15,56,221,228
-.byte	102,65,15,56,221,237
+	aesenclast	%xmm12,%xmm4
+	aesenclast	%xmm13,%xmm5
 	movdqa	16(%rsp),%xmm12
 	movdqa	32(%rsp),%xmm13
-.byte	102,65,15,56,221,246
-.byte	102,65,15,56,221,255
+	aesenclast	%xmm14,%xmm6
+	aesenclast	%xmm15,%xmm7
 	movdqa	48(%rsp),%xmm14
 	movdqa	64(%rsp),%xmm15
-.byte	102,68,15,56,221,193
+	aesenclast	%xmm1,%xmm8
 	movdqa	80(%rsp),%xmm0
 	movups	16-128(%rcx),%xmm1
-.byte	102,69,15,56,221,202
+	aesenclast	%xmm10,%xmm9
 
 	movups	%xmm2,(%rsi)
 	movdqa	%xmm11,%xmm2
@@ -1230,19 +1230,19 @@
 	pxor	%xmm9,%xmm9
 
 	movups	16(%rcx),%xmm0
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
+	aesenc	%xmm1,%xmm2
+	aesenc	%xmm1,%xmm3
 	leaq	32-16(%rcx,%rax,1),%rcx
 	negq	%rax
-.byte	102,15,56,220,225
+	aesenc	%xmm1,%xmm4
 	addq	$16,%rax
 	movups	(%rdi),%xmm10
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
+	aesenc	%xmm1,%xmm5
+	aesenc	%xmm1,%xmm6
 	movups	16(%rdi),%xmm11
 	movups	32(%rdi),%xmm12
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
+	aesenc	%xmm1,%xmm7
+	aesenc	%xmm1,%xmm8
 
 	call	.Lenc_loop8_enter
 
@@ -1273,20 +1273,20 @@
 
 .align	32
 .Lctr32_loop4:
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	leaq	16(%rcx),%rcx
 	decl	%eax
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
+	aesenc	%xmm1,%xmm5
 	movups	(%rcx),%xmm1
 	jnz	.Lctr32_loop4
-.byte	102,15,56,221,209
-.byte	102,15,56,221,217
+	aesenclast	%xmm1,%xmm2
+	aesenclast	%xmm1,%xmm3
 	movups	(%rdi),%xmm10
 	movups	16(%rdi),%xmm11
-.byte	102,15,56,221,225
-.byte	102,15,56,221,233
+	aesenclast	%xmm1,%xmm4
+	aesenclast	%xmm1,%xmm5
 	movups	32(%rdi),%xmm12
 	movups	48(%rdi),%xmm13
 
@@ -1302,16 +1302,16 @@
 
 .align	32
 .Lctr32_loop3:
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	leaq	16(%rcx),%rcx
 	decl	%eax
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
+	aesenc	%xmm1,%xmm3
+	aesenc	%xmm1,%xmm4
 	movups	(%rcx),%xmm1
 	jnz	.Lctr32_loop3
-.byte	102,15,56,221,209
-.byte	102,15,56,221,217
-.byte	102,15,56,221,225
+	aesenclast	%xmm1,%xmm2
+	aesenclast	%xmm1,%xmm3
+	aesenclast	%xmm1,%xmm4
 
 	movups	(%rdi),%xmm10
 	xorps	%xmm10,%xmm2
@@ -1394,12 +1394,12 @@
 	leaq	32(%rcx),%rcx
 	xorps	%xmm3,%xmm2
 .Loop_enc1_6:
-.byte	102,15,56,220,209
+	aesenc	%xmm1,%xmm2
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
 	jnz	.Loop_enc1_6
-.byte	102,15,56,221,209
+	aesenclast	%xmm1,%xmm2
 	movl	%r10d,%eax
 	movq	%r11,%rcx
 	movups	%xmm2,0(%rsi)
@@ -1445,12 +1445,12 @@
 	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 .Loop_dec1_7:
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	decl	%r10d
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
 	jnz	.Loop_dec1_7
-.byte	102,15,56,223,209
+	aesdeclast	%xmm1,%xmm2
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	movdqu	%xmm4,(%r8)
@@ -1509,166 +1509,166 @@
 	pxor	%xmm0,%xmm7
 	pxor	%xmm0,%xmm8
 
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	pxor	%xmm0,%xmm9
 	movups	32-112(%rcx),%xmm0
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
 	adcq	$0,%rbp
 	andq	$128,%rbp
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm9
 	addq	%rdi,%rbp
 	movups	48-112(%rcx),%xmm1
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	64-112(%rcx),%xmm0
 	nop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 	movups	80-112(%rcx),%xmm1
 	nop
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	96-112(%rcx),%xmm0
 	nop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 	movups	112-112(%rcx),%xmm1
 	nop
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	128-112(%rcx),%xmm0
 	nop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 	movups	144-112(%rcx),%xmm1
 	cmpl	$11,%eax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	160-112(%rcx),%xmm0
 	jb	.Lcbc_dec_done
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 	movups	176-112(%rcx),%xmm1
 	nop
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	192-112(%rcx),%xmm0
 	je	.Lcbc_dec_done
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 	movups	208-112(%rcx),%xmm1
 	nop
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
+	aesdec	%xmm0,%xmm2
+	aesdec	%xmm0,%xmm3
+	aesdec	%xmm0,%xmm4
+	aesdec	%xmm0,%xmm5
+	aesdec	%xmm0,%xmm6
+	aesdec	%xmm0,%xmm7
+	aesdec	%xmm0,%xmm8
+	aesdec	%xmm0,%xmm9
 	movups	224-112(%rcx),%xmm0
 	jmp	.Lcbc_dec_done
 .align	16
 .Lcbc_dec_done:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
+	aesdec	%xmm1,%xmm2
+	aesdec	%xmm1,%xmm3
 	pxor	%xmm0,%xmm10
 	pxor	%xmm0,%xmm11
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
+	aesdec	%xmm1,%xmm4
+	aesdec	%xmm1,%xmm5
 	pxor	%xmm0,%xmm12
 	pxor	%xmm0,%xmm13
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
+	aesdec	%xmm1,%xmm6
+	aesdec	%xmm1,%xmm7
 	pxor	%xmm0,%xmm14
 	pxor	%xmm0,%xmm15
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
+	aesdec	%xmm1,%xmm8
+	aesdec	%xmm1,%xmm9
 	movdqu	80(%rdi),%xmm1
 
-.byte	102,65,15,56,223,210
+	aesdeclast	%xmm10,%xmm2
 	movdqu	96(%rdi),%xmm10
 	pxor	%xmm0,%xmm1
-.byte	102,65,15,56,223,219
+	aesdeclast	%xmm11,%xmm3
 	pxor	%xmm0,%xmm10
 	movdqu	112(%rdi),%xmm0
-.byte	102,65,15,56,223,228
+	aesdeclast	%xmm12,%xmm4
 	leaq	128(%rdi),%rdi
 	movdqu	0(%rbp),%xmm11
-.byte	102,65,15,56,223,237
-.byte	102,65,15,56,223,246
+	aesdeclast	%xmm13,%xmm5
+	aesdeclast	%xmm14,%xmm6
 	movdqu	16(%rbp),%xmm12
 	movdqu	32(%rbp),%xmm13
-.byte	102,65,15,56,223,255
-.byte	102,68,15,56,223,193
+	aesdeclast	%xmm15,%xmm7
+	aesdeclast	%xmm1,%xmm8
 	movdqu	48(%rbp),%xmm14
 	movdqu	64(%rbp),%xmm15
-.byte	102,69,15,56,223,202
+	aesdeclast	%xmm10,%xmm9
 	movdqa	%xmm0,%xmm10
 	movdqu	80(%rbp),%xmm1
 	movups	-112(%rcx),%xmm0
@@ -1812,12 +1812,12 @@
 	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 .Loop_dec1_8:
-.byte	102,15,56,222,209
+	aesdec	%xmm1,%xmm2
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
 	jnz	.Loop_dec1_8
-.byte	102,15,56,223,209
+	aesdeclast	%xmm1,%xmm2
 	xorps	%xmm10,%xmm2
 	movaps	%xmm11,%xmm10
 	jmp	.Lcbc_dec_tail_collected
@@ -1928,8 +1928,8 @@
 .Ldec_key_inverse:
 	movups	(%rdi),%xmm0
 	movups	(%rdx),%xmm1
-.byte	102,15,56,219,192
-.byte	102,15,56,219,201
+	aesimc	%xmm0,%xmm0
+	aesimc	%xmm1,%xmm1
 	leaq	16(%rdi),%rdi
 	leaq	-16(%rdx),%rdx
 	movups	%xmm0,16(%rdx)
@@ -1938,7 +1938,7 @@
 	ja	.Ldec_key_inverse
 
 	movups	(%rdi),%xmm0
-.byte	102,15,56,219,192
+	aesimc	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	movups	%xmm0,(%rdx)
 	pxor	%xmm0,%xmm0
@@ -1974,25 +1974,25 @@
 	movl	$9,%esi
 
 	movups	%xmm0,(%rdx)
-.byte	102,15,58,223,200,1
+	aeskeygenassist	$0x1,%xmm0,%xmm1
 	call	.Lkey_expansion_128_cold
-.byte	102,15,58,223,200,2
+	aeskeygenassist	$0x2,%xmm0,%xmm1
 	call	.Lkey_expansion_128
-.byte	102,15,58,223,200,4
+	aeskeygenassist	$0x4,%xmm0,%xmm1
 	call	.Lkey_expansion_128
-.byte	102,15,58,223,200,8
+	aeskeygenassist	$0x8,%xmm0,%xmm1
 	call	.Lkey_expansion_128
-.byte	102,15,58,223,200,16
+	aeskeygenassist	$0x10,%xmm0,%xmm1
 	call	.Lkey_expansion_128
-.byte	102,15,58,223,200,32
+	aeskeygenassist	$0x20,%xmm0,%xmm1
 	call	.Lkey_expansion_128
-.byte	102,15,58,223,200,64
+	aeskeygenassist	$0x40,%xmm0,%xmm1
 	call	.Lkey_expansion_128
-.byte	102,15,58,223,200,128
+	aeskeygenassist	$0x80,%xmm0,%xmm1
 	call	.Lkey_expansion_128
-.byte	102,15,58,223,200,27
+	aeskeygenassist	$0x1b,%xmm0,%xmm1
 	call	.Lkey_expansion_128
-.byte	102,15,58,223,200,54
+	aeskeygenassist	$0x36,%xmm0,%xmm1
 	call	.Lkey_expansion_128
 	movups	%xmm0,(%rax)
 	movl	%esi,80(%rax)
@@ -2005,21 +2005,21 @@
 	movl	$11,%esi
 
 	movups	%xmm0,(%rdx)
-.byte	102,15,58,223,202,1
+	aeskeygenassist	$0x1,%xmm2,%xmm1
 	call	.Lkey_expansion_192a_cold
-.byte	102,15,58,223,202,2
+	aeskeygenassist	$0x2,%xmm2,%xmm1
 	call	.Lkey_expansion_192b
-.byte	102,15,58,223,202,4
+	aeskeygenassist	$0x4,%xmm2,%xmm1
 	call	.Lkey_expansion_192a
-.byte	102,15,58,223,202,8
+	aeskeygenassist	$0x8,%xmm2,%xmm1
 	call	.Lkey_expansion_192b
-.byte	102,15,58,223,202,16
+	aeskeygenassist	$0x10,%xmm2,%xmm1
 	call	.Lkey_expansion_192a
-.byte	102,15,58,223,202,32
+	aeskeygenassist	$0x20,%xmm2,%xmm1
 	call	.Lkey_expansion_192b
-.byte	102,15,58,223,202,64
+	aeskeygenassist	$0x40,%xmm2,%xmm1
 	call	.Lkey_expansion_192a
-.byte	102,15,58,223,202,128
+	aeskeygenassist	$0x80,%xmm2,%xmm1
 	call	.Lkey_expansion_192b
 	movups	%xmm0,(%rax)
 	movl	%esi,48(%rax)
@@ -2034,31 +2034,31 @@
 
 	movups	%xmm0,(%rdx)
 	movups	%xmm2,16(%rdx)
-.byte	102,15,58,223,202,1
+	aeskeygenassist	$0x1,%xmm2,%xmm1
 	call	.Lkey_expansion_256a_cold
-.byte	102,15,58,223,200,1
+	aeskeygenassist	$0x1,%xmm0,%xmm1
 	call	.Lkey_expansion_256b
-.byte	102,15,58,223,202,2
+	aeskeygenassist	$0x2,%xmm2,%xmm1
 	call	.Lkey_expansion_256a
-.byte	102,15,58,223,200,2
+	aeskeygenassist	$0x2,%xmm0,%xmm1
 	call	.Lkey_expansion_256b
-.byte	102,15,58,223,202,4
+	aeskeygenassist	$0x4,%xmm2,%xmm1
 	call	.Lkey_expansion_256a
-.byte	102,15,58,223,200,4
+	aeskeygenassist	$0x4,%xmm0,%xmm1
 	call	.Lkey_expansion_256b
-.byte	102,15,58,223,202,8
+	aeskeygenassist	$0x8,%xmm2,%xmm1
 	call	.Lkey_expansion_256a
-.byte	102,15,58,223,200,8
+	aeskeygenassist	$0x8,%xmm0,%xmm1
 	call	.Lkey_expansion_256b
-.byte	102,15,58,223,202,16
+	aeskeygenassist	$0x10,%xmm2,%xmm1
 	call	.Lkey_expansion_256a
-.byte	102,15,58,223,200,16
+	aeskeygenassist	$0x10,%xmm0,%xmm1
 	call	.Lkey_expansion_256b
-.byte	102,15,58,223,202,32
+	aeskeygenassist	$0x20,%xmm2,%xmm1
 	call	.Lkey_expansion_256a
-.byte	102,15,58,223,200,32
+	aeskeygenassist	$0x20,%xmm0,%xmm1
 	call	.Lkey_expansion_256b
-.byte	102,15,58,223,202,64
+	aeskeygenassist	$0x40,%xmm2,%xmm1
 	call	.Lkey_expansion_256a
 	movups	%xmm0,(%rax)
 	movl	%esi,16(%rax)
@@ -2196,8 +2196,8 @@
 
 .align	16
 .Loop_key128:
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
+	pshufb	%xmm5,%xmm0
+	aesenclast	%xmm4,%xmm0
 	pslld	$1,%xmm4
 	leaq	16(%rax),%rax
 
@@ -2218,8 +2218,8 @@
 
 	movdqa	.Lkey_rcon1b(%rip),%xmm4
 
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
+	pshufb	%xmm5,%xmm0
+	aesenclast	%xmm4,%xmm0
 	pslld	$1,%xmm4
 
 	movdqa	%xmm2,%xmm3
@@ -2234,8 +2234,8 @@
 	movdqu	%xmm0,(%rax)
 
 	movdqa	%xmm0,%xmm2
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
+	pshufb	%xmm5,%xmm0
+	aesenclast	%xmm4,%xmm0
 
 	movdqa	%xmm2,%xmm3
 	pslldq	$4,%xmm2
@@ -2266,8 +2266,8 @@
 .Loop_key192:
 	movq	%xmm2,0(%rax)
 	movdqa	%xmm2,%xmm1
-.byte	102,15,56,0,213
-.byte	102,15,56,221,212
+	pshufb	%xmm5,%xmm2
+	aesenclast	%xmm4,%xmm2
 	pslld	$1,%xmm4
 	leaq	24(%rax),%rax
 
@@ -2310,8 +2310,8 @@
 
 .align	16
 .Loop_key256:
-.byte	102,15,56,0,213
-.byte	102,15,56,221,212
+	pshufb	%xmm5,%xmm2
+	aesenclast	%xmm4,%xmm2
 
 	movdqa	%xmm0,%xmm3
 	pslldq	$4,%xmm0
@@ -2330,7 +2330,7 @@
 
 	pshufd	$0xff,%xmm0,%xmm2
 	pxor	%xmm3,%xmm3
-.byte	102,15,56,221,211
+	aesenclast	%xmm3,%xmm2
 
 	movdqa	%xmm1,%xmm3
 	pslldq	$4,%xmm1
diff --git a/gen/bcm/aesni-x86_64-win.asm b/gen/bcm/aesni-x86_64-win.asm
index c585507..8e592cd 100644
--- a/gen/bcm/aesni-x86_64-win.asm
+++ b/gen/bcm/aesni-x86_64-win.asm
@@ -30,12 +30,12 @@
 	lea	r8,[32+r8]
 	xorps	xmm2,xmm0
 $L$oop_enc1_1:
-	DB	102,15,56,220,209
+	aesenc	xmm2,xmm1
 	dec	eax
 	movups	xmm1,XMMWORD[r8]
 	lea	r8,[16+r8]
 	jnz	NEAR $L$oop_enc1_1
-	DB	102,15,56,221,209
+	aesenclast	xmm2,xmm1
 	pxor	xmm0,xmm0
 	pxor	xmm1,xmm1
 	movups	XMMWORD[rdx],xmm2
@@ -57,12 +57,12 @@
 	lea	r8,[32+r8]
 	xorps	xmm2,xmm0
 $L$oop_dec1_2:
-	DB	102,15,56,222,209
+	aesdec	xmm2,xmm1
 	dec	eax
 	movups	xmm1,XMMWORD[r8]
 	lea	r8,[16+r8]
 	jnz	NEAR $L$oop_dec1_2
-	DB	102,15,56,223,209
+	aesdeclast	xmm2,xmm1
 	pxor	xmm0,xmm0
 	pxor	xmm1,xmm1
 	movups	XMMWORD[rdx],xmm2
@@ -85,19 +85,19 @@
 	add	rax,16
 
 $L$enc_loop2:
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
 	movups	xmm1,XMMWORD[rax*1+rcx]
 	add	rax,32
-	DB	102,15,56,220,208
-	DB	102,15,56,220,216
+	aesenc	xmm2,xmm0
+	aesenc	xmm3,xmm0
 	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
 	jnz	NEAR $L$enc_loop2
 
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
-	DB	102,15,56,221,208
-	DB	102,15,56,221,216
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
+	aesenclast	xmm2,xmm0
+	aesenclast	xmm3,xmm0
 	ret
 
 
@@ -116,19 +116,19 @@
 	add	rax,16
 
 $L$dec_loop2:
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
 	movups	xmm1,XMMWORD[rax*1+rcx]
 	add	rax,32
-	DB	102,15,56,222,208
-	DB	102,15,56,222,216
+	aesdec	xmm2,xmm0
+	aesdec	xmm3,xmm0
 	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
 	jnz	NEAR $L$dec_loop2
 
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,223,208
-	DB	102,15,56,223,216
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdeclast	xmm2,xmm0
+	aesdeclast	xmm3,xmm0
 	ret
 
 
@@ -148,23 +148,23 @@
 	add	rax,16
 
 $L$enc_loop3:
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
-	DB	102,15,56,220,225
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
+	aesenc	xmm4,xmm1
 	movups	xmm1,XMMWORD[rax*1+rcx]
 	add	rax,32
-	DB	102,15,56,220,208
-	DB	102,15,56,220,216
-	DB	102,15,56,220,224
+	aesenc	xmm2,xmm0
+	aesenc	xmm3,xmm0
+	aesenc	xmm4,xmm0
 	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
 	jnz	NEAR $L$enc_loop3
 
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
-	DB	102,15,56,220,225
-	DB	102,15,56,221,208
-	DB	102,15,56,221,216
-	DB	102,15,56,221,224
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
+	aesenc	xmm4,xmm1
+	aesenclast	xmm2,xmm0
+	aesenclast	xmm3,xmm0
+	aesenclast	xmm4,xmm0
 	ret
 
 
@@ -184,23 +184,23 @@
 	add	rax,16
 
 $L$dec_loop3:
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
 	movups	xmm1,XMMWORD[rax*1+rcx]
 	add	rax,32
-	DB	102,15,56,222,208
-	DB	102,15,56,222,216
-	DB	102,15,56,222,224
+	aesdec	xmm2,xmm0
+	aesdec	xmm3,xmm0
+	aesdec	xmm4,xmm0
 	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
 	jnz	NEAR $L$dec_loop3
 
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
-	DB	102,15,56,223,208
-	DB	102,15,56,223,216
-	DB	102,15,56,223,224
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
+	aesdeclast	xmm2,xmm0
+	aesdeclast	xmm3,xmm0
+	aesdeclast	xmm4,xmm0
 	ret
 
 
@@ -222,27 +222,27 @@
 	add	rax,16
 
 $L$enc_loop4:
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
-	DB	102,15,56,220,225
-	DB	102,15,56,220,233
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
+	aesenc	xmm4,xmm1
+	aesenc	xmm5,xmm1
 	movups	xmm1,XMMWORD[rax*1+rcx]
 	add	rax,32
-	DB	102,15,56,220,208
-	DB	102,15,56,220,216
-	DB	102,15,56,220,224
-	DB	102,15,56,220,232
+	aesenc	xmm2,xmm0
+	aesenc	xmm3,xmm0
+	aesenc	xmm4,xmm0
+	aesenc	xmm5,xmm0
 	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
 	jnz	NEAR $L$enc_loop4
 
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
-	DB	102,15,56,220,225
-	DB	102,15,56,220,233
-	DB	102,15,56,221,208
-	DB	102,15,56,221,216
-	DB	102,15,56,221,224
-	DB	102,15,56,221,232
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
+	aesenc	xmm4,xmm1
+	aesenc	xmm5,xmm1
+	aesenclast	xmm2,xmm0
+	aesenclast	xmm3,xmm0
+	aesenclast	xmm4,xmm0
+	aesenclast	xmm5,xmm0
 	ret
 
 
@@ -264,27 +264,27 @@
 	add	rax,16
 
 $L$dec_loop4:
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
-	DB	102,15,56,222,233
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
+	aesdec	xmm5,xmm1
 	movups	xmm1,XMMWORD[rax*1+rcx]
 	add	rax,32
-	DB	102,15,56,222,208
-	DB	102,15,56,222,216
-	DB	102,15,56,222,224
-	DB	102,15,56,222,232
+	aesdec	xmm2,xmm0
+	aesdec	xmm3,xmm0
+	aesdec	xmm4,xmm0
+	aesdec	xmm5,xmm0
 	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
 	jnz	NEAR $L$dec_loop4
 
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
-	DB	102,15,56,222,233
-	DB	102,15,56,223,208
-	DB	102,15,56,223,216
-	DB	102,15,56,223,224
-	DB	102,15,56,223,232
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
+	aesdec	xmm5,xmm1
+	aesdeclast	xmm2,xmm0
+	aesdeclast	xmm3,xmm0
+	aesdeclast	xmm4,xmm0
+	aesdeclast	xmm5,xmm0
 	ret
 
 
@@ -298,49 +298,49 @@
 	xorps	xmm2,xmm0
 	pxor	xmm3,xmm0
 	pxor	xmm4,xmm0
-	DB	102,15,56,220,209
+	aesenc	xmm2,xmm1
 	lea	rcx,[32+rax*1+rcx]
 	neg	rax
-	DB	102,15,56,220,217
+	aesenc	xmm3,xmm1
 	pxor	xmm5,xmm0
 	pxor	xmm6,xmm0
-	DB	102,15,56,220,225
+	aesenc	xmm4,xmm1
 	pxor	xmm7,xmm0
 	movups	xmm0,XMMWORD[rax*1+rcx]
 	add	rax,16
 	jmp	NEAR $L$enc_loop6_enter
 ALIGN	16
 $L$enc_loop6:
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
-	DB	102,15,56,220,225
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
+	aesenc	xmm4,xmm1
 $L$enc_loop6_enter:
-	DB	102,15,56,220,233
-	DB	102,15,56,220,241
-	DB	102,15,56,220,249
+	aesenc	xmm5,xmm1
+	aesenc	xmm6,xmm1
+	aesenc	xmm7,xmm1
 	movups	xmm1,XMMWORD[rax*1+rcx]
 	add	rax,32
-	DB	102,15,56,220,208
-	DB	102,15,56,220,216
-	DB	102,15,56,220,224
-	DB	102,15,56,220,232
-	DB	102,15,56,220,240
-	DB	102,15,56,220,248
+	aesenc	xmm2,xmm0
+	aesenc	xmm3,xmm0
+	aesenc	xmm4,xmm0
+	aesenc	xmm5,xmm0
+	aesenc	xmm6,xmm0
+	aesenc	xmm7,xmm0
 	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
 	jnz	NEAR $L$enc_loop6
 
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
-	DB	102,15,56,220,225
-	DB	102,15,56,220,233
-	DB	102,15,56,220,241
-	DB	102,15,56,220,249
-	DB	102,15,56,221,208
-	DB	102,15,56,221,216
-	DB	102,15,56,221,224
-	DB	102,15,56,221,232
-	DB	102,15,56,221,240
-	DB	102,15,56,221,248
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
+	aesenc	xmm4,xmm1
+	aesenc	xmm5,xmm1
+	aesenc	xmm6,xmm1
+	aesenc	xmm7,xmm1
+	aesenclast	xmm2,xmm0
+	aesenclast	xmm3,xmm0
+	aesenclast	xmm4,xmm0
+	aesenclast	xmm5,xmm0
+	aesenclast	xmm6,xmm0
+	aesenclast	xmm7,xmm0
 	ret
 
 
@@ -354,49 +354,49 @@
 	xorps	xmm2,xmm0
 	pxor	xmm3,xmm0
 	pxor	xmm4,xmm0
-	DB	102,15,56,222,209
+	aesdec	xmm2,xmm1
 	lea	rcx,[32+rax*1+rcx]
 	neg	rax
-	DB	102,15,56,222,217
+	aesdec	xmm3,xmm1
 	pxor	xmm5,xmm0
 	pxor	xmm6,xmm0
-	DB	102,15,56,222,225
+	aesdec	xmm4,xmm1
 	pxor	xmm7,xmm0
 	movups	xmm0,XMMWORD[rax*1+rcx]
 	add	rax,16
 	jmp	NEAR $L$dec_loop6_enter
 ALIGN	16
 $L$dec_loop6:
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
 $L$dec_loop6_enter:
-	DB	102,15,56,222,233
-	DB	102,15,56,222,241
-	DB	102,15,56,222,249
+	aesdec	xmm5,xmm1
+	aesdec	xmm6,xmm1
+	aesdec	xmm7,xmm1
 	movups	xmm1,XMMWORD[rax*1+rcx]
 	add	rax,32
-	DB	102,15,56,222,208
-	DB	102,15,56,222,216
-	DB	102,15,56,222,224
-	DB	102,15,56,222,232
-	DB	102,15,56,222,240
-	DB	102,15,56,222,248
+	aesdec	xmm2,xmm0
+	aesdec	xmm3,xmm0
+	aesdec	xmm4,xmm0
+	aesdec	xmm5,xmm0
+	aesdec	xmm6,xmm0
+	aesdec	xmm7,xmm0
 	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
 	jnz	NEAR $L$dec_loop6
 
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
-	DB	102,15,56,222,233
-	DB	102,15,56,222,241
-	DB	102,15,56,222,249
-	DB	102,15,56,223,208
-	DB	102,15,56,223,216
-	DB	102,15,56,223,224
-	DB	102,15,56,223,232
-	DB	102,15,56,223,240
-	DB	102,15,56,223,248
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
+	aesdec	xmm5,xmm1
+	aesdec	xmm6,xmm1
+	aesdec	xmm7,xmm1
+	aesdeclast	xmm2,xmm0
+	aesdeclast	xmm3,xmm0
+	aesdeclast	xmm4,xmm0
+	aesdeclast	xmm5,xmm0
+	aesdeclast	xmm6,xmm0
+	aesdeclast	xmm7,xmm0
 	ret
 
 
@@ -414,55 +414,55 @@
 	pxor	xmm6,xmm0
 	lea	rcx,[32+rax*1+rcx]
 	neg	rax
-	DB	102,15,56,220,209
+	aesenc	xmm2,xmm1
 	pxor	xmm7,xmm0
 	pxor	xmm8,xmm0
-	DB	102,15,56,220,217
+	aesenc	xmm3,xmm1
 	pxor	xmm9,xmm0
 	movups	xmm0,XMMWORD[rax*1+rcx]
 	add	rax,16
 	jmp	NEAR $L$enc_loop8_inner
 ALIGN	16
 $L$enc_loop8:
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
 $L$enc_loop8_inner:
-	DB	102,15,56,220,225
-	DB	102,15,56,220,233
-	DB	102,15,56,220,241
-	DB	102,15,56,220,249
-	DB	102,68,15,56,220,193
-	DB	102,68,15,56,220,201
+	aesenc	xmm4,xmm1
+	aesenc	xmm5,xmm1
+	aesenc	xmm6,xmm1
+	aesenc	xmm7,xmm1
+	aesenc	xmm8,xmm1
+	aesenc	xmm9,xmm1
 $L$enc_loop8_enter:
 	movups	xmm1,XMMWORD[rax*1+rcx]
 	add	rax,32
-	DB	102,15,56,220,208
-	DB	102,15,56,220,216
-	DB	102,15,56,220,224
-	DB	102,15,56,220,232
-	DB	102,15,56,220,240
-	DB	102,15,56,220,248
-	DB	102,68,15,56,220,192
-	DB	102,68,15,56,220,200
+	aesenc	xmm2,xmm0
+	aesenc	xmm3,xmm0
+	aesenc	xmm4,xmm0
+	aesenc	xmm5,xmm0
+	aesenc	xmm6,xmm0
+	aesenc	xmm7,xmm0
+	aesenc	xmm8,xmm0
+	aesenc	xmm9,xmm0
 	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
 	jnz	NEAR $L$enc_loop8
 
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
-	DB	102,15,56,220,225
-	DB	102,15,56,220,233
-	DB	102,15,56,220,241
-	DB	102,15,56,220,249
-	DB	102,68,15,56,220,193
-	DB	102,68,15,56,220,201
-	DB	102,15,56,221,208
-	DB	102,15,56,221,216
-	DB	102,15,56,221,224
-	DB	102,15,56,221,232
-	DB	102,15,56,221,240
-	DB	102,15,56,221,248
-	DB	102,68,15,56,221,192
-	DB	102,68,15,56,221,200
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
+	aesenc	xmm4,xmm1
+	aesenc	xmm5,xmm1
+	aesenc	xmm6,xmm1
+	aesenc	xmm7,xmm1
+	aesenc	xmm8,xmm1
+	aesenc	xmm9,xmm1
+	aesenclast	xmm2,xmm0
+	aesenclast	xmm3,xmm0
+	aesenclast	xmm4,xmm0
+	aesenclast	xmm5,xmm0
+	aesenclast	xmm6,xmm0
+	aesenclast	xmm7,xmm0
+	aesenclast	xmm8,xmm0
+	aesenclast	xmm9,xmm0
 	ret
 
 
@@ -480,55 +480,55 @@
 	pxor	xmm6,xmm0
 	lea	rcx,[32+rax*1+rcx]
 	neg	rax
-	DB	102,15,56,222,209
+	aesdec	xmm2,xmm1
 	pxor	xmm7,xmm0
 	pxor	xmm8,xmm0
-	DB	102,15,56,222,217
+	aesdec	xmm3,xmm1
 	pxor	xmm9,xmm0
 	movups	xmm0,XMMWORD[rax*1+rcx]
 	add	rax,16
 	jmp	NEAR $L$dec_loop8_inner
 ALIGN	16
 $L$dec_loop8:
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
 $L$dec_loop8_inner:
-	DB	102,15,56,222,225
-	DB	102,15,56,222,233
-	DB	102,15,56,222,241
-	DB	102,15,56,222,249
-	DB	102,68,15,56,222,193
-	DB	102,68,15,56,222,201
+	aesdec	xmm4,xmm1
+	aesdec	xmm5,xmm1
+	aesdec	xmm6,xmm1
+	aesdec	xmm7,xmm1
+	aesdec	xmm8,xmm1
+	aesdec	xmm9,xmm1
 $L$dec_loop8_enter:
 	movups	xmm1,XMMWORD[rax*1+rcx]
 	add	rax,32
-	DB	102,15,56,222,208
-	DB	102,15,56,222,216
-	DB	102,15,56,222,224
-	DB	102,15,56,222,232
-	DB	102,15,56,222,240
-	DB	102,15,56,222,248
-	DB	102,68,15,56,222,192
-	DB	102,68,15,56,222,200
+	aesdec	xmm2,xmm0
+	aesdec	xmm3,xmm0
+	aesdec	xmm4,xmm0
+	aesdec	xmm5,xmm0
+	aesdec	xmm6,xmm0
+	aesdec	xmm7,xmm0
+	aesdec	xmm8,xmm0
+	aesdec	xmm9,xmm0
 	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
 	jnz	NEAR $L$dec_loop8
 
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
-	DB	102,15,56,222,233
-	DB	102,15,56,222,241
-	DB	102,15,56,222,249
-	DB	102,68,15,56,222,193
-	DB	102,68,15,56,222,201
-	DB	102,15,56,223,208
-	DB	102,15,56,223,216
-	DB	102,15,56,223,224
-	DB	102,15,56,223,232
-	DB	102,15,56,223,240
-	DB	102,15,56,223,248
-	DB	102,68,15,56,223,192
-	DB	102,68,15,56,223,200
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
+	aesdec	xmm5,xmm1
+	aesdec	xmm6,xmm1
+	aesdec	xmm7,xmm1
+	aesdec	xmm8,xmm1
+	aesdec	xmm9,xmm1
+	aesdeclast	xmm2,xmm0
+	aesdeclast	xmm3,xmm0
+	aesdeclast	xmm4,xmm0
+	aesdeclast	xmm5,xmm0
+	aesdeclast	xmm6,xmm0
+	aesdeclast	xmm7,xmm0
+	aesdeclast	xmm8,xmm0
+	aesdeclast	xmm9,xmm0
 	ret
 
 
@@ -656,12 +656,12 @@
 	lea	rcx,[32+rcx]
 	xorps	xmm2,xmm0
 $L$oop_enc1_3:
-	DB	102,15,56,220,209
+	aesenc	xmm2,xmm1
 	dec	eax
 	movups	xmm1,XMMWORD[rcx]
 	lea	rcx,[16+rcx]
 	jnz	NEAR $L$oop_enc1_3
-	DB	102,15,56,221,209
+	aesenclast	xmm2,xmm1
 	movups	XMMWORD[rsi],xmm2
 	jmp	NEAR $L$ecb_ret
 ALIGN	16
@@ -817,12 +817,12 @@
 	lea	rcx,[32+rcx]
 	xorps	xmm2,xmm0
 $L$oop_dec1_4:
-	DB	102,15,56,222,209
+	aesdec	xmm2,xmm1
 	dec	eax
 	movups	xmm1,XMMWORD[rcx]
 	lea	rcx,[16+rcx]
 	jnz	NEAR $L$oop_dec1_4
-	DB	102,15,56,223,209
+	aesdeclast	xmm2,xmm1
 	movups	XMMWORD[rsi],xmm2
 	pxor	xmm2,xmm2
 	jmp	NEAR $L$ecb_ret
@@ -939,12 +939,12 @@
 	lea	rcx,[32+rcx]
 	xorps	xmm2,xmm0
 $L$oop_enc1_5:
-	DB	102,15,56,220,209
+	aesenc	xmm2,xmm1
 	dec	edx
 	movups	xmm1,XMMWORD[rcx]
 	lea	rcx,[16+rcx]
 	jnz	NEAR $L$oop_enc1_5
-	DB	102,15,56,221,209
+	aesenclast	xmm2,xmm1
 	pxor	xmm0,xmm0
 	pxor	xmm1,xmm1
 	xorps	xmm2,xmm3
@@ -998,17 +998,17 @@
 	bswap	edx
 	xor	eax,ebp
 	xor	edx,ebp
-DB	102,15,58,34,216,3
+	pinsrd	xmm3,eax,3
 	lea	rax,[3+r8]
 	movdqa	XMMWORD[16+rsp],xmm3
-DB	102,15,58,34,226,3
+	pinsrd	xmm4,edx,3
 	bswap	eax
 	mov	rdx,r10
 	lea	r10,[4+r8]
 	movdqa	XMMWORD[32+rsp],xmm4
 	xor	eax,ebp
 	bswap	r10d
-DB	102,15,58,34,232,3
+	pinsrd	xmm5,eax,3
 	xor	r10d,ebp
 	movdqa	XMMWORD[48+rsp],xmm5
 	lea	r9,[5+r8]
@@ -1042,163 +1042,163 @@
 $L$ctr32_loop8:
 	add	r8d,8
 	movdqa	xmm8,XMMWORD[96+rsp]
-	DB	102,15,56,220,209
+	aesenc	xmm2,xmm1
 	mov	r9d,r8d
 	movdqa	xmm9,XMMWORD[112+rsp]
-	DB	102,15,56,220,217
+	aesenc	xmm3,xmm1
 	bswap	r9d
 	movups	xmm0,XMMWORD[((32-128))+rcx]
-	DB	102,15,56,220,225
+	aesenc	xmm4,xmm1
 	xor	r9d,ebp
 	nop
-	DB	102,15,56,220,233
+	aesenc	xmm5,xmm1
 	mov	DWORD[((0+12))+rsp],r9d
 	lea	r9,[1+r8]
-	DB	102,15,56,220,241
-	DB	102,15,56,220,249
-	DB	102,68,15,56,220,193
-	DB	102,68,15,56,220,201
+	aesenc	xmm6,xmm1
+	aesenc	xmm7,xmm1
+	aesenc	xmm8,xmm1
+	aesenc	xmm9,xmm1
 	movups	xmm1,XMMWORD[((48-128))+rcx]
 	bswap	r9d
-	DB	102,15,56,220,208
-	DB	102,15,56,220,216
+	aesenc	xmm2,xmm0
+	aesenc	xmm3,xmm0
 	xor	r9d,ebp
 	DB	0x66,0x90
-	DB	102,15,56,220,224
-	DB	102,15,56,220,232
+	aesenc	xmm4,xmm0
+	aesenc	xmm5,xmm0
 	mov	DWORD[((16+12))+rsp],r9d
 	lea	r9,[2+r8]
-	DB	102,15,56,220,240
-	DB	102,15,56,220,248
-	DB	102,68,15,56,220,192
-	DB	102,68,15,56,220,200
+	aesenc	xmm6,xmm0
+	aesenc	xmm7,xmm0
+	aesenc	xmm8,xmm0
+	aesenc	xmm9,xmm0
 	movups	xmm0,XMMWORD[((64-128))+rcx]
 	bswap	r9d
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
 	xor	r9d,ebp
 	DB	0x66,0x90
-	DB	102,15,56,220,225
-	DB	102,15,56,220,233
+	aesenc	xmm4,xmm1
+	aesenc	xmm5,xmm1
 	mov	DWORD[((32+12))+rsp],r9d
 	lea	r9,[3+r8]
-	DB	102,15,56,220,241
-	DB	102,15,56,220,249
-	DB	102,68,15,56,220,193
-	DB	102,68,15,56,220,201
+	aesenc	xmm6,xmm1
+	aesenc	xmm7,xmm1
+	aesenc	xmm8,xmm1
+	aesenc	xmm9,xmm1
 	movups	xmm1,XMMWORD[((80-128))+rcx]
 	bswap	r9d
-	DB	102,15,56,220,208
-	DB	102,15,56,220,216
+	aesenc	xmm2,xmm0
+	aesenc	xmm3,xmm0
 	xor	r9d,ebp
 	DB	0x66,0x90
-	DB	102,15,56,220,224
-	DB	102,15,56,220,232
+	aesenc	xmm4,xmm0
+	aesenc	xmm5,xmm0
 	mov	DWORD[((48+12))+rsp],r9d
 	lea	r9,[4+r8]
-	DB	102,15,56,220,240
-	DB	102,15,56,220,248
-	DB	102,68,15,56,220,192
-	DB	102,68,15,56,220,200
+	aesenc	xmm6,xmm0
+	aesenc	xmm7,xmm0
+	aesenc	xmm8,xmm0
+	aesenc	xmm9,xmm0
 	movups	xmm0,XMMWORD[((96-128))+rcx]
 	bswap	r9d
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
 	xor	r9d,ebp
 	DB	0x66,0x90
-	DB	102,15,56,220,225
-	DB	102,15,56,220,233
+	aesenc	xmm4,xmm1
+	aesenc	xmm5,xmm1
 	mov	DWORD[((64+12))+rsp],r9d
 	lea	r9,[5+r8]
-	DB	102,15,56,220,241
-	DB	102,15,56,220,249
-	DB	102,68,15,56,220,193
-	DB	102,68,15,56,220,201
+	aesenc	xmm6,xmm1
+	aesenc	xmm7,xmm1
+	aesenc	xmm8,xmm1
+	aesenc	xmm9,xmm1
 	movups	xmm1,XMMWORD[((112-128))+rcx]
 	bswap	r9d
-	DB	102,15,56,220,208
-	DB	102,15,56,220,216
+	aesenc	xmm2,xmm0
+	aesenc	xmm3,xmm0
 	xor	r9d,ebp
 	DB	0x66,0x90
-	DB	102,15,56,220,224
-	DB	102,15,56,220,232
+	aesenc	xmm4,xmm0
+	aesenc	xmm5,xmm0
 	mov	DWORD[((80+12))+rsp],r9d
 	lea	r9,[6+r8]
-	DB	102,15,56,220,240
-	DB	102,15,56,220,248
-	DB	102,68,15,56,220,192
-	DB	102,68,15,56,220,200
+	aesenc	xmm6,xmm0
+	aesenc	xmm7,xmm0
+	aesenc	xmm8,xmm0
+	aesenc	xmm9,xmm0
 	movups	xmm0,XMMWORD[((128-128))+rcx]
 	bswap	r9d
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
 	xor	r9d,ebp
 	DB	0x66,0x90
-	DB	102,15,56,220,225
-	DB	102,15,56,220,233
+	aesenc	xmm4,xmm1
+	aesenc	xmm5,xmm1
 	mov	DWORD[((96+12))+rsp],r9d
 	lea	r9,[7+r8]
-	DB	102,15,56,220,241
-	DB	102,15,56,220,249
-	DB	102,68,15,56,220,193
-	DB	102,68,15,56,220,201
+	aesenc	xmm6,xmm1
+	aesenc	xmm7,xmm1
+	aesenc	xmm8,xmm1
+	aesenc	xmm9,xmm1
 	movups	xmm1,XMMWORD[((144-128))+rcx]
 	bswap	r9d
-	DB	102,15,56,220,208
-	DB	102,15,56,220,216
-	DB	102,15,56,220,224
+	aesenc	xmm2,xmm0
+	aesenc	xmm3,xmm0
+	aesenc	xmm4,xmm0
 	xor	r9d,ebp
 	movdqu	xmm10,XMMWORD[rdi]
-	DB	102,15,56,220,232
+	aesenc	xmm5,xmm0
 	mov	DWORD[((112+12))+rsp],r9d
 	cmp	eax,11
-	DB	102,15,56,220,240
-	DB	102,15,56,220,248
-	DB	102,68,15,56,220,192
-	DB	102,68,15,56,220,200
+	aesenc	xmm6,xmm0
+	aesenc	xmm7,xmm0
+	aesenc	xmm8,xmm0
+	aesenc	xmm9,xmm0
 	movups	xmm0,XMMWORD[((160-128))+rcx]
 
 	jb	NEAR $L$ctr32_enc_done
 
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
-	DB	102,15,56,220,225
-	DB	102,15,56,220,233
-	DB	102,15,56,220,241
-	DB	102,15,56,220,249
-	DB	102,68,15,56,220,193
-	DB	102,68,15,56,220,201
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
+	aesenc	xmm4,xmm1
+	aesenc	xmm5,xmm1
+	aesenc	xmm6,xmm1
+	aesenc	xmm7,xmm1
+	aesenc	xmm8,xmm1
+	aesenc	xmm9,xmm1
 	movups	xmm1,XMMWORD[((176-128))+rcx]
 
-	DB	102,15,56,220,208
-	DB	102,15,56,220,216
-	DB	102,15,56,220,224
-	DB	102,15,56,220,232
-	DB	102,15,56,220,240
-	DB	102,15,56,220,248
-	DB	102,68,15,56,220,192
-	DB	102,68,15,56,220,200
+	aesenc	xmm2,xmm0
+	aesenc	xmm3,xmm0
+	aesenc	xmm4,xmm0
+	aesenc	xmm5,xmm0
+	aesenc	xmm6,xmm0
+	aesenc	xmm7,xmm0
+	aesenc	xmm8,xmm0
+	aesenc	xmm9,xmm0
 	movups	xmm0,XMMWORD[((192-128))+rcx]
 	je	NEAR $L$ctr32_enc_done
 
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
-	DB	102,15,56,220,225
-	DB	102,15,56,220,233
-	DB	102,15,56,220,241
-	DB	102,15,56,220,249
-	DB	102,68,15,56,220,193
-	DB	102,68,15,56,220,201
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
+	aesenc	xmm4,xmm1
+	aesenc	xmm5,xmm1
+	aesenc	xmm6,xmm1
+	aesenc	xmm7,xmm1
+	aesenc	xmm8,xmm1
+	aesenc	xmm9,xmm1
 	movups	xmm1,XMMWORD[((208-128))+rcx]
 
-	DB	102,15,56,220,208
-	DB	102,15,56,220,216
-	DB	102,15,56,220,224
-	DB	102,15,56,220,232
-	DB	102,15,56,220,240
-	DB	102,15,56,220,248
-	DB	102,68,15,56,220,192
-	DB	102,68,15,56,220,200
+	aesenc	xmm2,xmm0
+	aesenc	xmm3,xmm0
+	aesenc	xmm4,xmm0
+	aesenc	xmm5,xmm0
+	aesenc	xmm6,xmm0
+	aesenc	xmm7,xmm0
+	aesenc	xmm8,xmm0
+	aesenc	xmm9,xmm0
 	movups	xmm0,XMMWORD[((224-128))+rcx]
 	jmp	NEAR $L$ctr32_enc_done
 
@@ -1217,35 +1217,35 @@
 	prefetcht0	[448+rdi]
 	prefetcht0	[512+rdi]
 	pxor	xmm15,xmm0
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
-	DB	102,15,56,220,225
-	DB	102,15,56,220,233
-	DB	102,15,56,220,241
-	DB	102,15,56,220,249
-	DB	102,68,15,56,220,193
-	DB	102,68,15,56,220,201
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
+	aesenc	xmm4,xmm1
+	aesenc	xmm5,xmm1
+	aesenc	xmm6,xmm1
+	aesenc	xmm7,xmm1
+	aesenc	xmm8,xmm1
+	aesenc	xmm9,xmm1
 	movdqu	xmm1,XMMWORD[96+rdi]
 	lea	rdi,[128+rdi]
 
-	DB	102,65,15,56,221,210
+	aesenclast	xmm2,xmm10
 	pxor	xmm1,xmm0
 	movdqu	xmm10,XMMWORD[((112-128))+rdi]
-	DB	102,65,15,56,221,219
+	aesenclast	xmm3,xmm11
 	pxor	xmm10,xmm0
 	movdqa	xmm11,XMMWORD[rsp]
-	DB	102,65,15,56,221,228
-	DB	102,65,15,56,221,237
+	aesenclast	xmm4,xmm12
+	aesenclast	xmm5,xmm13
 	movdqa	xmm12,XMMWORD[16+rsp]
 	movdqa	xmm13,XMMWORD[32+rsp]
-	DB	102,65,15,56,221,246
-	DB	102,65,15,56,221,255
+	aesenclast	xmm6,xmm14
+	aesenclast	xmm7,xmm15
 	movdqa	xmm14,XMMWORD[48+rsp]
 	movdqa	xmm15,XMMWORD[64+rsp]
-	DB	102,68,15,56,221,193
+	aesenclast	xmm8,xmm1
 	movdqa	xmm0,XMMWORD[80+rsp]
 	movups	xmm1,XMMWORD[((16-128))+rcx]
-	DB	102,69,15,56,221,202
+	aesenclast	xmm9,xmm10
 
 	movups	XMMWORD[rsi],xmm2
 	movdqa	xmm2,xmm11
@@ -1284,19 +1284,19 @@
 	pxor	xmm9,xmm9
 
 	movups	xmm0,XMMWORD[16+rcx]
-	DB	102,15,56,220,209
-	DB	102,15,56,220,217
+	aesenc	xmm2,xmm1
+	aesenc	xmm3,xmm1
 	lea	rcx,[((32-16))+rax*1+rcx]
 	neg	rax
-	DB	102,15,56,220,225
+	aesenc	xmm4,xmm1
 	add	rax,16
 	movups	xmm10,XMMWORD[rdi]
-	DB	102,15,56,220,233
-	DB	102,15,56,220,241
+	aesenc	xmm5,xmm1
+	aesenc	xmm6,xmm1
 	movups	xmm11,XMMWORD[16+rdi]
 	movups	xmm12,XMMWORD[32+rdi]
-	DB	102,15,56,220,249
-	DB	102,68,15,56,220,193
+	aesenc	xmm7,xmm1
+	aesenc	xmm8,xmm1
 
 	call	$L$enc_loop8_enter
 
@@ -1327,20 +1327,20 @@
 
 ALIGN	32
 $L$ctr32_loop4:
-	DB	102,15,56,220,209
+	aesenc	xmm2,xmm1
 	lea	rcx,[16+rcx]
 	dec	eax
-	DB	102,15,56,220,217
-	DB	102,15,56,220,225
-	DB	102,15,56,220,233
+	aesenc	xmm3,xmm1
+	aesenc	xmm4,xmm1
+	aesenc	xmm5,xmm1
 	movups	xmm1,XMMWORD[rcx]
 	jnz	NEAR $L$ctr32_loop4
-	DB	102,15,56,221,209
-	DB	102,15,56,221,217
+	aesenclast	xmm2,xmm1
+	aesenclast	xmm3,xmm1
 	movups	xmm10,XMMWORD[rdi]
 	movups	xmm11,XMMWORD[16+rdi]
-	DB	102,15,56,221,225
-	DB	102,15,56,221,233
+	aesenclast	xmm4,xmm1
+	aesenclast	xmm5,xmm1
 	movups	xmm12,XMMWORD[32+rdi]
 	movups	xmm13,XMMWORD[48+rdi]
 
@@ -1356,16 +1356,16 @@
 
 ALIGN	32
 $L$ctr32_loop3:
-	DB	102,15,56,220,209
+	aesenc	xmm2,xmm1
 	lea	rcx,[16+rcx]
 	dec	eax
-	DB	102,15,56,220,217
-	DB	102,15,56,220,225
+	aesenc	xmm3,xmm1
+	aesenc	xmm4,xmm1
 	movups	xmm1,XMMWORD[rcx]
 	jnz	NEAR $L$ctr32_loop3
-	DB	102,15,56,221,209
-	DB	102,15,56,221,217
-	DB	102,15,56,221,225
+	aesenclast	xmm2,xmm1
+	aesenclast	xmm3,xmm1
+	aesenclast	xmm4,xmm1
 
 	movups	xmm10,XMMWORD[rdi]
 	xorps	xmm2,xmm10
@@ -1471,12 +1471,12 @@
 	lea	rcx,[32+rcx]
 	xorps	xmm2,xmm3
 $L$oop_enc1_6:
-	DB	102,15,56,220,209
+	aesenc	xmm2,xmm1
 	dec	eax
 	movups	xmm1,XMMWORD[rcx]
 	lea	rcx,[16+rcx]
 	jnz	NEAR $L$oop_enc1_6
-	DB	102,15,56,221,209
+	aesenclast	xmm2,xmm1
 	mov	eax,r10d
 	mov	rcx,r11
 	movups	XMMWORD[rsi],xmm2
@@ -1522,12 +1522,12 @@
 	lea	rcx,[32+rcx]
 	xorps	xmm2,xmm0
 $L$oop_dec1_7:
-	DB	102,15,56,222,209
+	aesdec	xmm2,xmm1
 	dec	r10d
 	movups	xmm1,XMMWORD[rcx]
 	lea	rcx,[16+rcx]
 	jnz	NEAR $L$oop_dec1_7
-	DB	102,15,56,223,209
+	aesdeclast	xmm2,xmm1
 	pxor	xmm0,xmm0
 	pxor	xmm1,xmm1
 	movdqu	XMMWORD[r8],xmm4
@@ -1597,166 +1597,166 @@
 	pxor	xmm7,xmm0
 	pxor	xmm8,xmm0
 
-	DB	102,15,56,222,209
+	aesdec	xmm2,xmm1
 	pxor	xmm9,xmm0
 	movups	xmm0,XMMWORD[((32-112))+rcx]
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
-	DB	102,15,56,222,233
-	DB	102,15,56,222,241
-	DB	102,15,56,222,249
-	DB	102,68,15,56,222,193
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
+	aesdec	xmm5,xmm1
+	aesdec	xmm6,xmm1
+	aesdec	xmm7,xmm1
+	aesdec	xmm8,xmm1
 	adc	rbp,0
 	and	rbp,128
-	DB	102,68,15,56,222,201
+	aesdec	xmm9,xmm1
 	add	rbp,rdi
 	movups	xmm1,XMMWORD[((48-112))+rcx]
-	DB	102,15,56,222,208
-	DB	102,15,56,222,216
-	DB	102,15,56,222,224
-	DB	102,15,56,222,232
-	DB	102,15,56,222,240
-	DB	102,15,56,222,248
-	DB	102,68,15,56,222,192
-	DB	102,68,15,56,222,200
+	aesdec	xmm2,xmm0
+	aesdec	xmm3,xmm0
+	aesdec	xmm4,xmm0
+	aesdec	xmm5,xmm0
+	aesdec	xmm6,xmm0
+	aesdec	xmm7,xmm0
+	aesdec	xmm8,xmm0
+	aesdec	xmm9,xmm0
 	movups	xmm0,XMMWORD[((64-112))+rcx]
 	nop
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
-	DB	102,15,56,222,233
-	DB	102,15,56,222,241
-	DB	102,15,56,222,249
-	DB	102,68,15,56,222,193
-	DB	102,68,15,56,222,201
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
+	aesdec	xmm5,xmm1
+	aesdec	xmm6,xmm1
+	aesdec	xmm7,xmm1
+	aesdec	xmm8,xmm1
+	aesdec	xmm9,xmm1
 	movups	xmm1,XMMWORD[((80-112))+rcx]
 	nop
-	DB	102,15,56,222,208
-	DB	102,15,56,222,216
-	DB	102,15,56,222,224
-	DB	102,15,56,222,232
-	DB	102,15,56,222,240
-	DB	102,15,56,222,248
-	DB	102,68,15,56,222,192
-	DB	102,68,15,56,222,200
+	aesdec	xmm2,xmm0
+	aesdec	xmm3,xmm0
+	aesdec	xmm4,xmm0
+	aesdec	xmm5,xmm0
+	aesdec	xmm6,xmm0
+	aesdec	xmm7,xmm0
+	aesdec	xmm8,xmm0
+	aesdec	xmm9,xmm0
 	movups	xmm0,XMMWORD[((96-112))+rcx]
 	nop
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
-	DB	102,15,56,222,233
-	DB	102,15,56,222,241
-	DB	102,15,56,222,249
-	DB	102,68,15,56,222,193
-	DB	102,68,15,56,222,201
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
+	aesdec	xmm5,xmm1
+	aesdec	xmm6,xmm1
+	aesdec	xmm7,xmm1
+	aesdec	xmm8,xmm1
+	aesdec	xmm9,xmm1
 	movups	xmm1,XMMWORD[((112-112))+rcx]
 	nop
-	DB	102,15,56,222,208
-	DB	102,15,56,222,216
-	DB	102,15,56,222,224
-	DB	102,15,56,222,232
-	DB	102,15,56,222,240
-	DB	102,15,56,222,248
-	DB	102,68,15,56,222,192
-	DB	102,68,15,56,222,200
+	aesdec	xmm2,xmm0
+	aesdec	xmm3,xmm0
+	aesdec	xmm4,xmm0
+	aesdec	xmm5,xmm0
+	aesdec	xmm6,xmm0
+	aesdec	xmm7,xmm0
+	aesdec	xmm8,xmm0
+	aesdec	xmm9,xmm0
 	movups	xmm0,XMMWORD[((128-112))+rcx]
 	nop
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
-	DB	102,15,56,222,233
-	DB	102,15,56,222,241
-	DB	102,15,56,222,249
-	DB	102,68,15,56,222,193
-	DB	102,68,15,56,222,201
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
+	aesdec	xmm5,xmm1
+	aesdec	xmm6,xmm1
+	aesdec	xmm7,xmm1
+	aesdec	xmm8,xmm1
+	aesdec	xmm9,xmm1
 	movups	xmm1,XMMWORD[((144-112))+rcx]
 	cmp	eax,11
-	DB	102,15,56,222,208
-	DB	102,15,56,222,216
-	DB	102,15,56,222,224
-	DB	102,15,56,222,232
-	DB	102,15,56,222,240
-	DB	102,15,56,222,248
-	DB	102,68,15,56,222,192
-	DB	102,68,15,56,222,200
+	aesdec	xmm2,xmm0
+	aesdec	xmm3,xmm0
+	aesdec	xmm4,xmm0
+	aesdec	xmm5,xmm0
+	aesdec	xmm6,xmm0
+	aesdec	xmm7,xmm0
+	aesdec	xmm8,xmm0
+	aesdec	xmm9,xmm0
 	movups	xmm0,XMMWORD[((160-112))+rcx]
 	jb	NEAR $L$cbc_dec_done
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
-	DB	102,15,56,222,233
-	DB	102,15,56,222,241
-	DB	102,15,56,222,249
-	DB	102,68,15,56,222,193
-	DB	102,68,15,56,222,201
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
+	aesdec	xmm5,xmm1
+	aesdec	xmm6,xmm1
+	aesdec	xmm7,xmm1
+	aesdec	xmm8,xmm1
+	aesdec	xmm9,xmm1
 	movups	xmm1,XMMWORD[((176-112))+rcx]
 	nop
-	DB	102,15,56,222,208
-	DB	102,15,56,222,216
-	DB	102,15,56,222,224
-	DB	102,15,56,222,232
-	DB	102,15,56,222,240
-	DB	102,15,56,222,248
-	DB	102,68,15,56,222,192
-	DB	102,68,15,56,222,200
+	aesdec	xmm2,xmm0
+	aesdec	xmm3,xmm0
+	aesdec	xmm4,xmm0
+	aesdec	xmm5,xmm0
+	aesdec	xmm6,xmm0
+	aesdec	xmm7,xmm0
+	aesdec	xmm8,xmm0
+	aesdec	xmm9,xmm0
 	movups	xmm0,XMMWORD[((192-112))+rcx]
 	je	NEAR $L$cbc_dec_done
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
-	DB	102,15,56,222,225
-	DB	102,15,56,222,233
-	DB	102,15,56,222,241
-	DB	102,15,56,222,249
-	DB	102,68,15,56,222,193
-	DB	102,68,15,56,222,201
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
+	aesdec	xmm4,xmm1
+	aesdec	xmm5,xmm1
+	aesdec	xmm6,xmm1
+	aesdec	xmm7,xmm1
+	aesdec	xmm8,xmm1
+	aesdec	xmm9,xmm1
 	movups	xmm1,XMMWORD[((208-112))+rcx]
 	nop
-	DB	102,15,56,222,208
-	DB	102,15,56,222,216
-	DB	102,15,56,222,224
-	DB	102,15,56,222,232
-	DB	102,15,56,222,240
-	DB	102,15,56,222,248
-	DB	102,68,15,56,222,192
-	DB	102,68,15,56,222,200
+	aesdec	xmm2,xmm0
+	aesdec	xmm3,xmm0
+	aesdec	xmm4,xmm0
+	aesdec	xmm5,xmm0
+	aesdec	xmm6,xmm0
+	aesdec	xmm7,xmm0
+	aesdec	xmm8,xmm0
+	aesdec	xmm9,xmm0
 	movups	xmm0,XMMWORD[((224-112))+rcx]
 	jmp	NEAR $L$cbc_dec_done
 ALIGN	16
 $L$cbc_dec_done:
-	DB	102,15,56,222,209
-	DB	102,15,56,222,217
+	aesdec	xmm2,xmm1
+	aesdec	xmm3,xmm1
 	pxor	xmm10,xmm0
 	pxor	xmm11,xmm0
-	DB	102,15,56,222,225
-	DB	102,15,56,222,233
+	aesdec	xmm4,xmm1
+	aesdec	xmm5,xmm1
 	pxor	xmm12,xmm0
 	pxor	xmm13,xmm0
-	DB	102,15,56,222,241
-	DB	102,15,56,222,249
+	aesdec	xmm6,xmm1
+	aesdec	xmm7,xmm1
 	pxor	xmm14,xmm0
 	pxor	xmm15,xmm0
-	DB	102,68,15,56,222,193
-	DB	102,68,15,56,222,201
+	aesdec	xmm8,xmm1
+	aesdec	xmm9,xmm1
 	movdqu	xmm1,XMMWORD[80+rdi]
 
-	DB	102,65,15,56,223,210
+	aesdeclast	xmm2,xmm10
 	movdqu	xmm10,XMMWORD[96+rdi]
 	pxor	xmm1,xmm0
-	DB	102,65,15,56,223,219
+	aesdeclast	xmm3,xmm11
 	pxor	xmm10,xmm0
 	movdqu	xmm0,XMMWORD[112+rdi]
-	DB	102,65,15,56,223,228
+	aesdeclast	xmm4,xmm12
 	lea	rdi,[128+rdi]
 	movdqu	xmm11,XMMWORD[rbp]
-	DB	102,65,15,56,223,237
-	DB	102,65,15,56,223,246
+	aesdeclast	xmm5,xmm13
+	aesdeclast	xmm6,xmm14
 	movdqu	xmm12,XMMWORD[16+rbp]
 	movdqu	xmm13,XMMWORD[32+rbp]
-	DB	102,65,15,56,223,255
-	DB	102,68,15,56,223,193
+	aesdeclast	xmm7,xmm15
+	aesdeclast	xmm8,xmm1
 	movdqu	xmm14,XMMWORD[48+rbp]
 	movdqu	xmm15,XMMWORD[64+rbp]
-	DB	102,69,15,56,223,202
+	aesdeclast	xmm9,xmm10
 	movdqa	xmm10,xmm0
 	movdqu	xmm1,XMMWORD[80+rbp]
 	movups	xmm0,XMMWORD[((-112))+rcx]
@@ -1900,12 +1900,12 @@
 	lea	rcx,[32+rcx]
 	xorps	xmm2,xmm0
 $L$oop_dec1_8:
-	DB	102,15,56,222,209
+	aesdec	xmm2,xmm1
 	dec	eax
 	movups	xmm1,XMMWORD[rcx]
 	lea	rcx,[16+rcx]
 	jnz	NEAR $L$oop_dec1_8
-	DB	102,15,56,223,209
+	aesdeclast	xmm2,xmm1
 	xorps	xmm2,xmm10
 	movaps	xmm10,xmm11
 	jmp	NEAR $L$cbc_dec_tail_collected
@@ -2033,8 +2033,8 @@
 $L$dec_key_inverse:
 	movups	xmm0,XMMWORD[rcx]
 	movups	xmm1,XMMWORD[r8]
-	DB	102,15,56,219,192
-	DB	102,15,56,219,201
+	aesimc	xmm0,xmm0
+	aesimc	xmm1,xmm1
 	lea	rcx,[16+rcx]
 	lea	r8,[((-16))+r8]
 	movups	XMMWORD[16+r8],xmm0
@@ -2043,7 +2043,7 @@
 	ja	NEAR $L$dec_key_inverse
 
 	movups	xmm0,XMMWORD[rcx]
-	DB	102,15,56,219,192
+	aesimc	xmm0,xmm0
 	pxor	xmm1,xmm1
 	movups	XMMWORD[r8],xmm0
 	pxor	xmm0,xmm0
@@ -2078,25 +2078,25 @@
 	mov	edx,9
 
 	movups	XMMWORD[r8],xmm0
-	DB	102,15,58,223,200,1
+	aeskeygenassist	xmm1,xmm0,0x1
 	call	$L$key_expansion_128_cold
-	DB	102,15,58,223,200,2
+	aeskeygenassist	xmm1,xmm0,0x2
 	call	$L$key_expansion_128
-	DB	102,15,58,223,200,4
+	aeskeygenassist	xmm1,xmm0,0x4
 	call	$L$key_expansion_128
-	DB	102,15,58,223,200,8
+	aeskeygenassist	xmm1,xmm0,0x8
 	call	$L$key_expansion_128
-	DB	102,15,58,223,200,16
+	aeskeygenassist	xmm1,xmm0,0x10
 	call	$L$key_expansion_128
-	DB	102,15,58,223,200,32
+	aeskeygenassist	xmm1,xmm0,0x20
 	call	$L$key_expansion_128
-	DB	102,15,58,223,200,64
+	aeskeygenassist	xmm1,xmm0,0x40
 	call	$L$key_expansion_128
-	DB	102,15,58,223,200,128
+	aeskeygenassist	xmm1,xmm0,0x80
 	call	$L$key_expansion_128
-	DB	102,15,58,223,200,27
+	aeskeygenassist	xmm1,xmm0,0x1b
 	call	$L$key_expansion_128
-	DB	102,15,58,223,200,54
+	aeskeygenassist	xmm1,xmm0,0x36
 	call	$L$key_expansion_128
 	movups	XMMWORD[rax],xmm0
 	mov	DWORD[80+rax],edx
@@ -2109,21 +2109,21 @@
 	mov	edx,11
 
 	movups	XMMWORD[r8],xmm0
-	DB	102,15,58,223,202,1
+	aeskeygenassist	xmm1,xmm2,0x1
 	call	$L$key_expansion_192a_cold
-	DB	102,15,58,223,202,2
+	aeskeygenassist	xmm1,xmm2,0x2
 	call	$L$key_expansion_192b
-	DB	102,15,58,223,202,4
+	aeskeygenassist	xmm1,xmm2,0x4
 	call	$L$key_expansion_192a
-	DB	102,15,58,223,202,8
+	aeskeygenassist	xmm1,xmm2,0x8
 	call	$L$key_expansion_192b
-	DB	102,15,58,223,202,16
+	aeskeygenassist	xmm1,xmm2,0x10
 	call	$L$key_expansion_192a
-	DB	102,15,58,223,202,32
+	aeskeygenassist	xmm1,xmm2,0x20
 	call	$L$key_expansion_192b
-	DB	102,15,58,223,202,64
+	aeskeygenassist	xmm1,xmm2,0x40
 	call	$L$key_expansion_192a
-	DB	102,15,58,223,202,128
+	aeskeygenassist	xmm1,xmm2,0x80
 	call	$L$key_expansion_192b
 	movups	XMMWORD[rax],xmm0
 	mov	DWORD[48+rax],edx
@@ -2138,31 +2138,31 @@
 
 	movups	XMMWORD[r8],xmm0
 	movups	XMMWORD[16+r8],xmm2
-	DB	102,15,58,223,202,1
+	aeskeygenassist	xmm1,xmm2,0x1
 	call	$L$key_expansion_256a_cold
-	DB	102,15,58,223,200,1
+	aeskeygenassist	xmm1,xmm0,0x1
 	call	$L$key_expansion_256b
-	DB	102,15,58,223,202,2
+	aeskeygenassist	xmm1,xmm2,0x2
 	call	$L$key_expansion_256a
-	DB	102,15,58,223,200,2
+	aeskeygenassist	xmm1,xmm0,0x2
 	call	$L$key_expansion_256b
-	DB	102,15,58,223,202,4
+	aeskeygenassist	xmm1,xmm2,0x4
 	call	$L$key_expansion_256a
-	DB	102,15,58,223,200,4
+	aeskeygenassist	xmm1,xmm0,0x4
 	call	$L$key_expansion_256b
-	DB	102,15,58,223,202,8
+	aeskeygenassist	xmm1,xmm2,0x8
 	call	$L$key_expansion_256a
-	DB	102,15,58,223,200,8
+	aeskeygenassist	xmm1,xmm0,0x8
 	call	$L$key_expansion_256b
-	DB	102,15,58,223,202,16
+	aeskeygenassist	xmm1,xmm2,0x10
 	call	$L$key_expansion_256a
-	DB	102,15,58,223,200,16
+	aeskeygenassist	xmm1,xmm0,0x10
 	call	$L$key_expansion_256b
-	DB	102,15,58,223,202,32
+	aeskeygenassist	xmm1,xmm2,0x20
 	call	$L$key_expansion_256a
-	DB	102,15,58,223,200,32
+	aeskeygenassist	xmm1,xmm0,0x20
 	call	$L$key_expansion_256b
-	DB	102,15,58,223,202,64
+	aeskeygenassist	xmm1,xmm2,0x40
 	call	$L$key_expansion_256a
 	movups	XMMWORD[rax],xmm0
 	mov	DWORD[16+rax],edx
@@ -2299,8 +2299,8 @@
 
 ALIGN	16
 $L$oop_key128:
-DB	102,15,56,0,197
-	DB	102,15,56,221,196
+	pshufb	xmm0,xmm5
+	aesenclast	xmm0,xmm4
 	pslld	xmm4,1
 	lea	rax,[16+rax]
 
@@ -2321,8 +2321,8 @@
 
 	movdqa	xmm4,XMMWORD[$L$key_rcon1b]
 
-DB	102,15,56,0,197
-	DB	102,15,56,221,196
+	pshufb	xmm0,xmm5
+	aesenclast	xmm0,xmm4
 	pslld	xmm4,1
 
 	movdqa	xmm3,xmm2
@@ -2337,8 +2337,8 @@
 	movdqu	XMMWORD[rax],xmm0
 
 	movdqa	xmm2,xmm0
-DB	102,15,56,0,197
-	DB	102,15,56,221,196
+	pshufb	xmm0,xmm5
+	aesenclast	xmm0,xmm4
 
 	movdqa	xmm3,xmm2
 	pslldq	xmm2,4
@@ -2369,8 +2369,8 @@
 $L$oop_key192:
 	movq	QWORD[rax],xmm2
 	movdqa	xmm1,xmm2
-DB	102,15,56,0,213
-	DB	102,15,56,221,212
+	pshufb	xmm2,xmm5
+	aesenclast	xmm2,xmm4
 	pslld	xmm4,1
 	lea	rax,[24+rax]
 
@@ -2413,8 +2413,8 @@
 
 ALIGN	16
 $L$oop_key256:
-DB	102,15,56,0,213
-	DB	102,15,56,221,212
+	pshufb	xmm2,xmm5
+	aesenclast	xmm2,xmm4
 
 	movdqa	xmm3,xmm0
 	pslldq	xmm0,4
@@ -2433,7 +2433,7 @@
 
 	pshufd	xmm2,xmm0,0xff
 	pxor	xmm3,xmm3
-	DB	102,15,56,221,211
+	aesenclast	xmm2,xmm3
 
 	movdqa	xmm3,xmm1
 	pslldq	xmm1,4
diff --git a/gen/bcm/ghash-ssse3-x86_64-apple.S b/gen/bcm/ghash-ssse3-x86_64-apple.S
index 651cca3..53af23f 100644
--- a/gen/bcm/ghash-ssse3-x86_64-apple.S
+++ b/gen/bcm/ghash-ssse3-x86_64-apple.S
@@ -23,7 +23,7 @@
 	movdqa	L$low4_mask(%rip),%xmm2
 
 
-.byte	102,65,15,56,0,194
+	pshufb	%xmm10,%xmm0
 
 
 	movdqa	%xmm2,%xmm1
@@ -43,7 +43,7 @@
 
 
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 
@@ -51,8 +51,8 @@
 
 
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 
 
 	pxor	%xmm5,%xmm2
@@ -91,7 +91,7 @@
 
 
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 
@@ -99,8 +99,8 @@
 
 
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 
 
 	pxor	%xmm5,%xmm2
@@ -139,7 +139,7 @@
 
 
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 
@@ -147,8 +147,8 @@
 
 
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 
 
 	pxor	%xmm5,%xmm2
@@ -181,7 +181,7 @@
 	pxor	%xmm3,%xmm2
 	pxor	%xmm3,%xmm3
 
-.byte	102,65,15,56,0,210
+	pshufb	%xmm10,%xmm2
 	movdqu	%xmm2,(%rdi)
 
 
@@ -218,14 +218,14 @@
 
 
 
-.byte	102,65,15,56,0,194
+	pshufb	%xmm10,%xmm0
 
 
 	pxor	%xmm3,%xmm3
 L$oop_ghash:
 
 	movdqu	(%rdx),%xmm1
-.byte	102,65,15,56,0,202
+	pshufb	%xmm10,%xmm1
 	pxor	%xmm1,%xmm0
 
 
@@ -246,7 +246,7 @@
 
 
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 
@@ -254,8 +254,8 @@
 
 
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 
 
 	pxor	%xmm5,%xmm2
@@ -294,7 +294,7 @@
 
 
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 
@@ -302,8 +302,8 @@
 
 
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 
 
 	pxor	%xmm5,%xmm2
@@ -342,7 +342,7 @@
 
 
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 
@@ -350,8 +350,8 @@
 
 
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 
 
 	pxor	%xmm5,%xmm2
@@ -394,7 +394,7 @@
 	jnz	L$oop_ghash
 
 
-.byte	102,65,15,56,0,194
+	pshufb	%xmm10,%xmm0
 	movdqu	%xmm0,(%rdi)
 
 
diff --git a/gen/bcm/ghash-ssse3-x86_64-linux.S b/gen/bcm/ghash-ssse3-x86_64-linux.S
index 84ac20a..edce38d 100644
--- a/gen/bcm/ghash-ssse3-x86_64-linux.S
+++ b/gen/bcm/ghash-ssse3-x86_64-linux.S
@@ -23,7 +23,7 @@
 	movdqa	.Llow4_mask(%rip),%xmm2
 
 
-.byte	102,65,15,56,0,194
+	pshufb	%xmm10,%xmm0
 
 
 	movdqa	%xmm2,%xmm1
@@ -43,7 +43,7 @@
 
 
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 
@@ -51,8 +51,8 @@
 
 
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 
 
 	pxor	%xmm5,%xmm2
@@ -91,7 +91,7 @@
 
 
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 
@@ -99,8 +99,8 @@
 
 
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 
 
 	pxor	%xmm5,%xmm2
@@ -139,7 +139,7 @@
 
 
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 
@@ -147,8 +147,8 @@
 
 
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 
 
 	pxor	%xmm5,%xmm2
@@ -181,7 +181,7 @@
 	pxor	%xmm3,%xmm2
 	pxor	%xmm3,%xmm3
 
-.byte	102,65,15,56,0,210
+	pshufb	%xmm10,%xmm2
 	movdqu	%xmm2,(%rdi)
 
 
@@ -218,14 +218,14 @@
 
 
 
-.byte	102,65,15,56,0,194
+	pshufb	%xmm10,%xmm0
 
 
 	pxor	%xmm3,%xmm3
 .Loop_ghash:
 
 	movdqu	(%rdx),%xmm1
-.byte	102,65,15,56,0,202
+	pshufb	%xmm10,%xmm1
 	pxor	%xmm1,%xmm0
 
 
@@ -246,7 +246,7 @@
 
 
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 
@@ -254,8 +254,8 @@
 
 
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 
 
 	pxor	%xmm5,%xmm2
@@ -294,7 +294,7 @@
 
 
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 
@@ -302,8 +302,8 @@
 
 
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 
 
 	pxor	%xmm5,%xmm2
@@ -342,7 +342,7 @@
 
 
 	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
+	palignr	$1,%xmm3,%xmm6
 	movdqa	%xmm6,%xmm3
 	psrldq	$1,%xmm2
 
@@ -350,8 +350,8 @@
 
 
 	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm1,%xmm5
 
 
 	pxor	%xmm5,%xmm2
@@ -394,7 +394,7 @@
 	jnz	.Loop_ghash
 
 
-.byte	102,65,15,56,0,194
+	pshufb	%xmm10,%xmm0
 	movdqu	%xmm0,(%rdi)
 
 
diff --git a/gen/bcm/ghash-ssse3-x86_64-win.asm b/gen/bcm/ghash-ssse3-x86_64-win.asm
index c00e039..5bcd094 100644
--- a/gen/bcm/ghash-ssse3-x86_64-win.asm
+++ b/gen/bcm/ghash-ssse3-x86_64-win.asm
@@ -37,7 +37,7 @@
 	movdqa	xmm2,XMMWORD[$L$low4_mask]
 
 
-DB	102,65,15,56,0,194
+	pshufb	xmm0,xmm10
 
 
 	movdqa	xmm1,xmm2
@@ -57,7 +57,7 @@
 
 
 	movdqa	xmm6,xmm2
-DB	102,15,58,15,243,1
+	palignr	xmm6,xmm3,1
 	movdqa	xmm3,xmm6
 	psrldq	xmm2,1
 
@@ -65,8 +65,8 @@
 
 
 	movdqa	xmm5,xmm4
-DB	102,15,56,0,224
-DB	102,15,56,0,233
+	pshufb	xmm4,xmm0
+	pshufb	xmm5,xmm1
 
 
 	pxor	xmm2,xmm5
@@ -105,7 +105,7 @@
 
 
 	movdqa	xmm6,xmm2
-DB	102,15,58,15,243,1
+	palignr	xmm6,xmm3,1
 	movdqa	xmm3,xmm6
 	psrldq	xmm2,1
 
@@ -113,8 +113,8 @@
 
 
 	movdqa	xmm5,xmm4
-DB	102,15,56,0,224
-DB	102,15,56,0,233
+	pshufb	xmm4,xmm0
+	pshufb	xmm5,xmm1
 
 
 	pxor	xmm2,xmm5
@@ -153,7 +153,7 @@
 
 
 	movdqa	xmm6,xmm2
-DB	102,15,58,15,243,1
+	palignr	xmm6,xmm3,1
 	movdqa	xmm3,xmm6
 	psrldq	xmm2,1
 
@@ -161,8 +161,8 @@
 
 
 	movdqa	xmm5,xmm4
-DB	102,15,56,0,224
-DB	102,15,56,0,233
+	pshufb	xmm4,xmm0
+	pshufb	xmm5,xmm1
 
 
 	pxor	xmm2,xmm5
@@ -195,7 +195,7 @@
 	pxor	xmm2,xmm3
 	pxor	xmm3,xmm3
 
-DB	102,65,15,56,0,210
+	pshufb	xmm2,xmm10
 	movdqu	XMMWORD[rcx],xmm2
 
 
@@ -243,14 +243,14 @@
 
 
 
-DB	102,65,15,56,0,194
+	pshufb	xmm0,xmm10
 
 
 	pxor	xmm3,xmm3
 $L$oop_ghash:
 
 	movdqu	xmm1,XMMWORD[r8]
-DB	102,65,15,56,0,202
+	pshufb	xmm1,xmm10
 	pxor	xmm0,xmm1
 
 
@@ -271,7 +271,7 @@
 
 
 	movdqa	xmm6,xmm2
-DB	102,15,58,15,243,1
+	palignr	xmm6,xmm3,1
 	movdqa	xmm3,xmm6
 	psrldq	xmm2,1
 
@@ -279,8 +279,8 @@
 
 
 	movdqa	xmm5,xmm4
-DB	102,15,56,0,224
-DB	102,15,56,0,233
+	pshufb	xmm4,xmm0
+	pshufb	xmm5,xmm1
 
 
 	pxor	xmm2,xmm5
@@ -319,7 +319,7 @@
 
 
 	movdqa	xmm6,xmm2
-DB	102,15,58,15,243,1
+	palignr	xmm6,xmm3,1
 	movdqa	xmm3,xmm6
 	psrldq	xmm2,1
 
@@ -327,8 +327,8 @@
 
 
 	movdqa	xmm5,xmm4
-DB	102,15,56,0,224
-DB	102,15,56,0,233
+	pshufb	xmm4,xmm0
+	pshufb	xmm5,xmm1
 
 
 	pxor	xmm2,xmm5
@@ -367,7 +367,7 @@
 
 
 	movdqa	xmm6,xmm2
-DB	102,15,58,15,243,1
+	palignr	xmm6,xmm3,1
 	movdqa	xmm3,xmm6
 	psrldq	xmm2,1
 
@@ -375,8 +375,8 @@
 
 
 	movdqa	xmm5,xmm4
-DB	102,15,56,0,224
-DB	102,15,56,0,233
+	pshufb	xmm4,xmm0
+	pshufb	xmm5,xmm1
 
 
 	pxor	xmm2,xmm5
@@ -419,7 +419,7 @@
 	jnz	NEAR $L$oop_ghash
 
 
-DB	102,65,15,56,0,194
+	pshufb	xmm0,xmm10
 	movdqu	XMMWORD[rcx],xmm0
 
 
diff --git a/gen/bcm/ghash-x86_64-apple.S b/gen/bcm/ghash-x86_64-apple.S
index 4961298..0cf60d1 100644
--- a/gen/bcm/ghash-x86_64-apple.S
+++ b/gen/bcm/ghash-x86_64-apple.S
@@ -38,9 +38,9 @@
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,222,0
+	pclmulqdq	$0x00,%xmm2,%xmm0
+	pclmulqdq	$0x11,%xmm2,%xmm1
+	pclmulqdq	$0x00,%xmm6,%xmm3
 	pxor	%xmm0,%xmm3
 	pxor	%xmm1,%xmm3
 
@@ -78,14 +78,14 @@
 	movdqu	%xmm2,0(%rdi)
 	pxor	%xmm0,%xmm4
 	movdqu	%xmm0,16(%rdi)
-.byte	102,15,58,15,227,8
+	palignr	$8,%xmm3,%xmm4
 	movdqu	%xmm4,32(%rdi)
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,222,0
+	pclmulqdq	$0x00,%xmm2,%xmm0
+	pclmulqdq	$0x11,%xmm2,%xmm1
+	pclmulqdq	$0x00,%xmm6,%xmm3
 	pxor	%xmm0,%xmm3
 	pxor	%xmm1,%xmm3
 
@@ -121,9 +121,9 @@
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,222,0
+	pclmulqdq	$0x00,%xmm2,%xmm0
+	pclmulqdq	$0x11,%xmm2,%xmm1
+	pclmulqdq	$0x00,%xmm6,%xmm3
 	pxor	%xmm0,%xmm3
 	pxor	%xmm1,%xmm3
 
@@ -161,7 +161,7 @@
 	movdqu	%xmm5,48(%rdi)
 	pxor	%xmm0,%xmm4
 	movdqu	%xmm0,64(%rdi)
-.byte	102,15,58,15,227,8
+	palignr	$8,%xmm3,%xmm4
 	movdqu	%xmm4,80(%rdi)
 	ret
 
@@ -179,13 +179,13 @@
 	movdqa	L$bswap_mask(%rip),%xmm5
 	movdqu	(%rsi),%xmm2
 	movdqu	32(%rsi),%xmm4
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
+	pclmulqdq	$0x00,%xmm2,%xmm0
+	pclmulqdq	$0x11,%xmm2,%xmm1
+	pclmulqdq	$0x00,%xmm4,%xmm3
 	pxor	%xmm0,%xmm3
 	pxor	%xmm1,%xmm3
 
@@ -217,7 +217,7 @@
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	movdqu	%xmm0,(%rdi)
 	ret
 
@@ -236,7 +236,7 @@
 	movdqu	(%rdi),%xmm0
 	movdqu	(%rsi),%xmm2
 	movdqu	32(%rsi),%xmm7
-.byte	102,65,15,56,0,194
+	pshufb	%xmm10,%xmm0
 
 	subq	$0x10,%rcx
 	jz	L$odd_tail
@@ -255,21 +255,21 @@
 
 	movdqu	48(%rdx),%xmm3
 	movdqu	32(%rdx),%xmm11
-.byte	102,65,15,56,0,218
-.byte	102,69,15,56,0,218
+	pshufb	%xmm10,%xmm3
+	pshufb	%xmm10,%xmm11
 	movdqa	%xmm3,%xmm5
 	pshufd	$78,%xmm3,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,68,218,0
-.byte	102,15,58,68,234,17
-.byte	102,15,58,68,231,0
+	pclmulqdq	$0x00,%xmm2,%xmm3
+	pclmulqdq	$0x11,%xmm2,%xmm5
+	pclmulqdq	$0x00,%xmm7,%xmm4
 
 	movdqa	%xmm11,%xmm13
 	pshufd	$78,%xmm11,%xmm12
 	pxor	%xmm11,%xmm12
-.byte	102,68,15,58,68,222,0
-.byte	102,68,15,58,68,238,17
-.byte	102,68,15,58,68,231,16
+	pclmulqdq	$0x00,%xmm6,%xmm11
+	pclmulqdq	$0x11,%xmm6,%xmm13
+	pclmulqdq	$0x10,%xmm7,%xmm12
 	xorps	%xmm11,%xmm3
 	xorps	%xmm13,%xmm5
 	movups	80(%rsi),%xmm7
@@ -277,18 +277,18 @@
 
 	movdqu	16(%rdx),%xmm11
 	movdqu	0(%rdx),%xmm8
-.byte	102,69,15,56,0,218
-.byte	102,69,15,56,0,194
+	pshufb	%xmm10,%xmm11
+	pshufb	%xmm10,%xmm8
 	movdqa	%xmm11,%xmm13
 	pshufd	$78,%xmm11,%xmm12
 	pxor	%xmm8,%xmm0
 	pxor	%xmm11,%xmm12
-.byte	102,69,15,58,68,222,0
+	pclmulqdq	$0x00,%xmm14,%xmm11
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm8
 	pxor	%xmm0,%xmm8
-.byte	102,69,15,58,68,238,17
-.byte	102,68,15,58,68,231,0
+	pclmulqdq	$0x11,%xmm14,%xmm13
+	pclmulqdq	$0x00,%xmm7,%xmm12
 	xorps	%xmm11,%xmm3
 	xorps	%xmm13,%xmm5
 
@@ -299,22 +299,22 @@
 	jmp	L$mod4_loop
 .p2align	5
 L$mod4_loop:
-.byte	102,65,15,58,68,199,0
+	pclmulqdq	$0x00,%xmm15,%xmm0
 	xorps	%xmm12,%xmm4
 	movdqu	48(%rdx),%xmm11
-.byte	102,69,15,56,0,218
-.byte	102,65,15,58,68,207,17
+	pshufb	%xmm10,%xmm11
+	pclmulqdq	$0x11,%xmm15,%xmm1
 	xorps	%xmm3,%xmm0
 	movdqu	32(%rdx),%xmm3
 	movdqa	%xmm11,%xmm13
-.byte	102,68,15,58,68,199,16
+	pclmulqdq	$0x10,%xmm7,%xmm8
 	pshufd	$78,%xmm11,%xmm12
 	xorps	%xmm5,%xmm1
 	pxor	%xmm11,%xmm12
-.byte	102,65,15,56,0,218
+	pshufb	%xmm10,%xmm3
 	movups	32(%rsi),%xmm7
 	xorps	%xmm4,%xmm8
-.byte	102,68,15,58,68,218,0
+	pclmulqdq	$0x00,%xmm2,%xmm11
 	pshufd	$78,%xmm3,%xmm4
 
 	pxor	%xmm0,%xmm8
@@ -322,22 +322,22 @@
 	pxor	%xmm1,%xmm8
 	pxor	%xmm3,%xmm4
 	movdqa	%xmm8,%xmm9
-.byte	102,68,15,58,68,234,17
+	pclmulqdq	$0x11,%xmm2,%xmm13
 	pslldq	$8,%xmm8
 	psrldq	$8,%xmm9
 	pxor	%xmm8,%xmm0
 	movdqa	L$7_mask(%rip),%xmm8
 	pxor	%xmm9,%xmm1
-.byte	102,76,15,110,200
+	movq	%rax,%xmm9
 
 	pand	%xmm0,%xmm8
-.byte	102,69,15,56,0,200
+	pshufb	%xmm8,%xmm9
 	pxor	%xmm0,%xmm9
-.byte	102,68,15,58,68,231,0
+	pclmulqdq	$0x00,%xmm7,%xmm12
 	psllq	$57,%xmm9
 	movdqa	%xmm9,%xmm8
 	pslldq	$8,%xmm9
-.byte	102,15,58,68,222,0
+	pclmulqdq	$0x00,%xmm6,%xmm3
 	psrldq	$8,%xmm8
 	pxor	%xmm9,%xmm0
 	pxor	%xmm8,%xmm1
@@ -345,14 +345,14 @@
 
 	movdqa	%xmm0,%xmm9
 	psrlq	$1,%xmm0
-.byte	102,15,58,68,238,17
+	pclmulqdq	$0x11,%xmm6,%xmm5
 	xorps	%xmm11,%xmm3
 	movdqu	16(%rdx),%xmm11
-.byte	102,69,15,56,0,218
-.byte	102,15,58,68,231,16
+	pshufb	%xmm10,%xmm11
+	pclmulqdq	$0x10,%xmm7,%xmm4
 	xorps	%xmm13,%xmm5
 	movups	80(%rsi),%xmm7
-.byte	102,69,15,56,0,194
+	pshufb	%xmm10,%xmm8
 	pxor	%xmm9,%xmm1
 	pxor	%xmm0,%xmm9
 	psrlq	$5,%xmm0
@@ -363,16 +363,16 @@
 	pxor	%xmm9,%xmm0
 	pxor	%xmm8,%xmm1
 	pxor	%xmm11,%xmm12
-.byte	102,69,15,58,68,222,0
+	pclmulqdq	$0x00,%xmm14,%xmm11
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
 	movdqa	%xmm0,%xmm1
-.byte	102,69,15,58,68,238,17
+	pclmulqdq	$0x11,%xmm14,%xmm13
 	xorps	%xmm11,%xmm3
 	pshufd	$78,%xmm0,%xmm8
 	pxor	%xmm0,%xmm8
 
-.byte	102,68,15,58,68,231,0
+	pclmulqdq	$0x00,%xmm7,%xmm12
 	xorps	%xmm13,%xmm5
 
 	leaq	64(%rdx),%rdx
@@ -380,9 +380,9 @@
 	jnc	L$mod4_loop
 
 L$tail4x:
-.byte	102,65,15,58,68,199,0
-.byte	102,65,15,58,68,207,17
-.byte	102,68,15,58,68,199,16
+	pclmulqdq	$0x00,%xmm15,%xmm0
+	pclmulqdq	$0x11,%xmm15,%xmm1
+	pclmulqdq	$0x10,%xmm7,%xmm8
 	xorps	%xmm12,%xmm4
 	xorps	%xmm3,%xmm0
 	xorps	%xmm5,%xmm1
@@ -433,16 +433,16 @@
 
 	movdqu	(%rdx),%xmm8
 	movdqu	16(%rdx),%xmm3
-.byte	102,69,15,56,0,194
-.byte	102,65,15,56,0,218
+	pshufb	%xmm10,%xmm8
+	pshufb	%xmm10,%xmm3
 	pxor	%xmm8,%xmm0
 
 	movdqa	%xmm3,%xmm5
 	pshufd	$78,%xmm3,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,68,218,0
-.byte	102,15,58,68,234,17
-.byte	102,15,58,68,231,0
+	pclmulqdq	$0x00,%xmm2,%xmm3
+	pclmulqdq	$0x11,%xmm2,%xmm5
+	pclmulqdq	$0x00,%xmm7,%xmm4
 
 	leaq	32(%rdx),%rdx
 	nop
@@ -458,21 +458,21 @@
 	pshufd	$78,%xmm0,%xmm4
 	pxor	%xmm0,%xmm4
 
-.byte	102,15,58,68,198,0
-.byte	102,15,58,68,206,17
-.byte	102,15,58,68,231,16
+	pclmulqdq	$0x00,%xmm6,%xmm0
+	pclmulqdq	$0x11,%xmm6,%xmm1
+	pclmulqdq	$0x10,%xmm7,%xmm4
 
 	pxor	%xmm3,%xmm0
 	pxor	%xmm5,%xmm1
 	movdqu	(%rdx),%xmm9
 	pxor	%xmm0,%xmm8
-.byte	102,69,15,56,0,202
+	pshufb	%xmm10,%xmm9
 	movdqu	16(%rdx),%xmm3
 
 	pxor	%xmm1,%xmm8
 	pxor	%xmm9,%xmm1
 	pxor	%xmm8,%xmm4
-.byte	102,65,15,56,0,218
+	pshufb	%xmm10,%xmm3
 	movdqa	%xmm4,%xmm8
 	psrldq	$8,%xmm8
 	pslldq	$8,%xmm4
@@ -485,7 +485,7 @@
 	movdqa	%xmm0,%xmm8
 	psllq	$5,%xmm0
 	pxor	%xmm0,%xmm8
-.byte	102,15,58,68,218,0
+	pclmulqdq	$0x00,%xmm2,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm8,%xmm0
 	psllq	$57,%xmm0
@@ -499,14 +499,14 @@
 
 	movdqa	%xmm0,%xmm9
 	psrlq	$1,%xmm0
-.byte	102,15,58,68,234,17
+	pclmulqdq	$0x11,%xmm2,%xmm5
 	pxor	%xmm9,%xmm1
 	pxor	%xmm0,%xmm9
 	psrlq	$5,%xmm0
 	pxor	%xmm9,%xmm0
 	leaq	32(%rdx),%rdx
 	psrlq	$1,%xmm0
-.byte	102,15,58,68,231,0
+	pclmulqdq	$0x00,%xmm7,%xmm4
 	pxor	%xmm1,%xmm0
 
 	subq	$0x20,%rcx
@@ -518,9 +518,9 @@
 	pshufd	$78,%xmm0,%xmm4
 	pxor	%xmm0,%xmm4
 
-.byte	102,15,58,68,198,0
-.byte	102,15,58,68,206,17
-.byte	102,15,58,68,231,16
+	pclmulqdq	$0x00,%xmm6,%xmm0
+	pclmulqdq	$0x11,%xmm6,%xmm1
+	pclmulqdq	$0x10,%xmm7,%xmm4
 
 	pxor	%xmm3,%xmm0
 	pxor	%xmm5,%xmm1
@@ -560,14 +560,14 @@
 
 L$odd_tail:
 	movdqu	(%rdx),%xmm8
-.byte	102,69,15,56,0,194
+	pshufb	%xmm10,%xmm8
 	pxor	%xmm8,%xmm0
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,223,0
+	pclmulqdq	$0x00,%xmm2,%xmm0
+	pclmulqdq	$0x11,%xmm2,%xmm1
+	pclmulqdq	$0x00,%xmm7,%xmm3
 	pxor	%xmm0,%xmm3
 	pxor	%xmm1,%xmm3
 
@@ -600,7 +600,7 @@
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
 L$done:
-.byte	102,65,15,56,0,194
+	pshufb	%xmm10,%xmm0
 	movdqu	%xmm0,(%rdi)
 	ret
 
diff --git a/gen/bcm/ghash-x86_64-linux.S b/gen/bcm/ghash-x86_64-linux.S
index e00bb9f..f1ffcb8 100644
--- a/gen/bcm/ghash-x86_64-linux.S
+++ b/gen/bcm/ghash-x86_64-linux.S
@@ -38,9 +38,9 @@
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,222,0
+	pclmulqdq	$0x00,%xmm2,%xmm0
+	pclmulqdq	$0x11,%xmm2,%xmm1
+	pclmulqdq	$0x00,%xmm6,%xmm3
 	pxor	%xmm0,%xmm3
 	pxor	%xmm1,%xmm3
 
@@ -78,14 +78,14 @@
 	movdqu	%xmm2,0(%rdi)
 	pxor	%xmm0,%xmm4
 	movdqu	%xmm0,16(%rdi)
-.byte	102,15,58,15,227,8
+	palignr	$8,%xmm3,%xmm4
 	movdqu	%xmm4,32(%rdi)
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,222,0
+	pclmulqdq	$0x00,%xmm2,%xmm0
+	pclmulqdq	$0x11,%xmm2,%xmm1
+	pclmulqdq	$0x00,%xmm6,%xmm3
 	pxor	%xmm0,%xmm3
 	pxor	%xmm1,%xmm3
 
@@ -121,9 +121,9 @@
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,222,0
+	pclmulqdq	$0x00,%xmm2,%xmm0
+	pclmulqdq	$0x11,%xmm2,%xmm1
+	pclmulqdq	$0x00,%xmm6,%xmm3
 	pxor	%xmm0,%xmm3
 	pxor	%xmm1,%xmm3
 
@@ -161,7 +161,7 @@
 	movdqu	%xmm5,48(%rdi)
 	pxor	%xmm0,%xmm4
 	movdqu	%xmm0,64(%rdi)
-.byte	102,15,58,15,227,8
+	palignr	$8,%xmm3,%xmm4
 	movdqu	%xmm4,80(%rdi)
 	ret
 .cfi_endproc	
@@ -179,13 +179,13 @@
 	movdqa	.Lbswap_mask(%rip),%xmm5
 	movdqu	(%rsi),%xmm2
 	movdqu	32(%rsi),%xmm4
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
+	pclmulqdq	$0x00,%xmm2,%xmm0
+	pclmulqdq	$0x11,%xmm2,%xmm1
+	pclmulqdq	$0x00,%xmm4,%xmm3
 	pxor	%xmm0,%xmm3
 	pxor	%xmm1,%xmm3
 
@@ -217,7 +217,7 @@
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,197
+	pshufb	%xmm5,%xmm0
 	movdqu	%xmm0,(%rdi)
 	ret
 .cfi_endproc	
@@ -236,7 +236,7 @@
 	movdqu	(%rdi),%xmm0
 	movdqu	(%rsi),%xmm2
 	movdqu	32(%rsi),%xmm7
-.byte	102,65,15,56,0,194
+	pshufb	%xmm10,%xmm0
 
 	subq	$0x10,%rcx
 	jz	.Lodd_tail
@@ -255,21 +255,21 @@
 
 	movdqu	48(%rdx),%xmm3
 	movdqu	32(%rdx),%xmm11
-.byte	102,65,15,56,0,218
-.byte	102,69,15,56,0,218
+	pshufb	%xmm10,%xmm3
+	pshufb	%xmm10,%xmm11
 	movdqa	%xmm3,%xmm5
 	pshufd	$78,%xmm3,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,68,218,0
-.byte	102,15,58,68,234,17
-.byte	102,15,58,68,231,0
+	pclmulqdq	$0x00,%xmm2,%xmm3
+	pclmulqdq	$0x11,%xmm2,%xmm5
+	pclmulqdq	$0x00,%xmm7,%xmm4
 
 	movdqa	%xmm11,%xmm13
 	pshufd	$78,%xmm11,%xmm12
 	pxor	%xmm11,%xmm12
-.byte	102,68,15,58,68,222,0
-.byte	102,68,15,58,68,238,17
-.byte	102,68,15,58,68,231,16
+	pclmulqdq	$0x00,%xmm6,%xmm11
+	pclmulqdq	$0x11,%xmm6,%xmm13
+	pclmulqdq	$0x10,%xmm7,%xmm12
 	xorps	%xmm11,%xmm3
 	xorps	%xmm13,%xmm5
 	movups	80(%rsi),%xmm7
@@ -277,18 +277,18 @@
 
 	movdqu	16(%rdx),%xmm11
 	movdqu	0(%rdx),%xmm8
-.byte	102,69,15,56,0,218
-.byte	102,69,15,56,0,194
+	pshufb	%xmm10,%xmm11
+	pshufb	%xmm10,%xmm8
 	movdqa	%xmm11,%xmm13
 	pshufd	$78,%xmm11,%xmm12
 	pxor	%xmm8,%xmm0
 	pxor	%xmm11,%xmm12
-.byte	102,69,15,58,68,222,0
+	pclmulqdq	$0x00,%xmm14,%xmm11
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm8
 	pxor	%xmm0,%xmm8
-.byte	102,69,15,58,68,238,17
-.byte	102,68,15,58,68,231,0
+	pclmulqdq	$0x11,%xmm14,%xmm13
+	pclmulqdq	$0x00,%xmm7,%xmm12
 	xorps	%xmm11,%xmm3
 	xorps	%xmm13,%xmm5
 
@@ -299,22 +299,22 @@
 	jmp	.Lmod4_loop
 .align	32
 .Lmod4_loop:
-.byte	102,65,15,58,68,199,0
+	pclmulqdq	$0x00,%xmm15,%xmm0
 	xorps	%xmm12,%xmm4
 	movdqu	48(%rdx),%xmm11
-.byte	102,69,15,56,0,218
-.byte	102,65,15,58,68,207,17
+	pshufb	%xmm10,%xmm11
+	pclmulqdq	$0x11,%xmm15,%xmm1
 	xorps	%xmm3,%xmm0
 	movdqu	32(%rdx),%xmm3
 	movdqa	%xmm11,%xmm13
-.byte	102,68,15,58,68,199,16
+	pclmulqdq	$0x10,%xmm7,%xmm8
 	pshufd	$78,%xmm11,%xmm12
 	xorps	%xmm5,%xmm1
 	pxor	%xmm11,%xmm12
-.byte	102,65,15,56,0,218
+	pshufb	%xmm10,%xmm3
 	movups	32(%rsi),%xmm7
 	xorps	%xmm4,%xmm8
-.byte	102,68,15,58,68,218,0
+	pclmulqdq	$0x00,%xmm2,%xmm11
 	pshufd	$78,%xmm3,%xmm4
 
 	pxor	%xmm0,%xmm8
@@ -322,22 +322,22 @@
 	pxor	%xmm1,%xmm8
 	pxor	%xmm3,%xmm4
 	movdqa	%xmm8,%xmm9
-.byte	102,68,15,58,68,234,17
+	pclmulqdq	$0x11,%xmm2,%xmm13
 	pslldq	$8,%xmm8
 	psrldq	$8,%xmm9
 	pxor	%xmm8,%xmm0
 	movdqa	.L7_mask(%rip),%xmm8
 	pxor	%xmm9,%xmm1
-.byte	102,76,15,110,200
+	movq	%rax,%xmm9
 
 	pand	%xmm0,%xmm8
-.byte	102,69,15,56,0,200
+	pshufb	%xmm8,%xmm9
 	pxor	%xmm0,%xmm9
-.byte	102,68,15,58,68,231,0
+	pclmulqdq	$0x00,%xmm7,%xmm12
 	psllq	$57,%xmm9
 	movdqa	%xmm9,%xmm8
 	pslldq	$8,%xmm9
-.byte	102,15,58,68,222,0
+	pclmulqdq	$0x00,%xmm6,%xmm3
 	psrldq	$8,%xmm8
 	pxor	%xmm9,%xmm0
 	pxor	%xmm8,%xmm1
@@ -345,14 +345,14 @@
 
 	movdqa	%xmm0,%xmm9
 	psrlq	$1,%xmm0
-.byte	102,15,58,68,238,17
+	pclmulqdq	$0x11,%xmm6,%xmm5
 	xorps	%xmm11,%xmm3
 	movdqu	16(%rdx),%xmm11
-.byte	102,69,15,56,0,218
-.byte	102,15,58,68,231,16
+	pshufb	%xmm10,%xmm11
+	pclmulqdq	$0x10,%xmm7,%xmm4
 	xorps	%xmm13,%xmm5
 	movups	80(%rsi),%xmm7
-.byte	102,69,15,56,0,194
+	pshufb	%xmm10,%xmm8
 	pxor	%xmm9,%xmm1
 	pxor	%xmm0,%xmm9
 	psrlq	$5,%xmm0
@@ -363,16 +363,16 @@
 	pxor	%xmm9,%xmm0
 	pxor	%xmm8,%xmm1
 	pxor	%xmm11,%xmm12
-.byte	102,69,15,58,68,222,0
+	pclmulqdq	$0x00,%xmm14,%xmm11
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
 	movdqa	%xmm0,%xmm1
-.byte	102,69,15,58,68,238,17
+	pclmulqdq	$0x11,%xmm14,%xmm13
 	xorps	%xmm11,%xmm3
 	pshufd	$78,%xmm0,%xmm8
 	pxor	%xmm0,%xmm8
 
-.byte	102,68,15,58,68,231,0
+	pclmulqdq	$0x00,%xmm7,%xmm12
 	xorps	%xmm13,%xmm5
 
 	leaq	64(%rdx),%rdx
@@ -380,9 +380,9 @@
 	jnc	.Lmod4_loop
 
 .Ltail4x:
-.byte	102,65,15,58,68,199,0
-.byte	102,65,15,58,68,207,17
-.byte	102,68,15,58,68,199,16
+	pclmulqdq	$0x00,%xmm15,%xmm0
+	pclmulqdq	$0x11,%xmm15,%xmm1
+	pclmulqdq	$0x10,%xmm7,%xmm8
 	xorps	%xmm12,%xmm4
 	xorps	%xmm3,%xmm0
 	xorps	%xmm5,%xmm1
@@ -433,16 +433,16 @@
 
 	movdqu	(%rdx),%xmm8
 	movdqu	16(%rdx),%xmm3
-.byte	102,69,15,56,0,194
-.byte	102,65,15,56,0,218
+	pshufb	%xmm10,%xmm8
+	pshufb	%xmm10,%xmm3
 	pxor	%xmm8,%xmm0
 
 	movdqa	%xmm3,%xmm5
 	pshufd	$78,%xmm3,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,68,218,0
-.byte	102,15,58,68,234,17
-.byte	102,15,58,68,231,0
+	pclmulqdq	$0x00,%xmm2,%xmm3
+	pclmulqdq	$0x11,%xmm2,%xmm5
+	pclmulqdq	$0x00,%xmm7,%xmm4
 
 	leaq	32(%rdx),%rdx
 	nop
@@ -458,21 +458,21 @@
 	pshufd	$78,%xmm0,%xmm4
 	pxor	%xmm0,%xmm4
 
-.byte	102,15,58,68,198,0
-.byte	102,15,58,68,206,17
-.byte	102,15,58,68,231,16
+	pclmulqdq	$0x00,%xmm6,%xmm0
+	pclmulqdq	$0x11,%xmm6,%xmm1
+	pclmulqdq	$0x10,%xmm7,%xmm4
 
 	pxor	%xmm3,%xmm0
 	pxor	%xmm5,%xmm1
 	movdqu	(%rdx),%xmm9
 	pxor	%xmm0,%xmm8
-.byte	102,69,15,56,0,202
+	pshufb	%xmm10,%xmm9
 	movdqu	16(%rdx),%xmm3
 
 	pxor	%xmm1,%xmm8
 	pxor	%xmm9,%xmm1
 	pxor	%xmm8,%xmm4
-.byte	102,65,15,56,0,218
+	pshufb	%xmm10,%xmm3
 	movdqa	%xmm4,%xmm8
 	psrldq	$8,%xmm8
 	pslldq	$8,%xmm4
@@ -485,7 +485,7 @@
 	movdqa	%xmm0,%xmm8
 	psllq	$5,%xmm0
 	pxor	%xmm0,%xmm8
-.byte	102,15,58,68,218,0
+	pclmulqdq	$0x00,%xmm2,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm8,%xmm0
 	psllq	$57,%xmm0
@@ -499,14 +499,14 @@
 
 	movdqa	%xmm0,%xmm9
 	psrlq	$1,%xmm0
-.byte	102,15,58,68,234,17
+	pclmulqdq	$0x11,%xmm2,%xmm5
 	pxor	%xmm9,%xmm1
 	pxor	%xmm0,%xmm9
 	psrlq	$5,%xmm0
 	pxor	%xmm9,%xmm0
 	leaq	32(%rdx),%rdx
 	psrlq	$1,%xmm0
-.byte	102,15,58,68,231,0
+	pclmulqdq	$0x00,%xmm7,%xmm4
 	pxor	%xmm1,%xmm0
 
 	subq	$0x20,%rcx
@@ -518,9 +518,9 @@
 	pshufd	$78,%xmm0,%xmm4
 	pxor	%xmm0,%xmm4
 
-.byte	102,15,58,68,198,0
-.byte	102,15,58,68,206,17
-.byte	102,15,58,68,231,16
+	pclmulqdq	$0x00,%xmm6,%xmm0
+	pclmulqdq	$0x11,%xmm6,%xmm1
+	pclmulqdq	$0x10,%xmm7,%xmm4
 
 	pxor	%xmm3,%xmm0
 	pxor	%xmm5,%xmm1
@@ -560,14 +560,14 @@
 
 .Lodd_tail:
 	movdqu	(%rdx),%xmm8
-.byte	102,69,15,56,0,194
+	pshufb	%xmm10,%xmm8
 	pxor	%xmm8,%xmm0
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
 	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,223,0
+	pclmulqdq	$0x00,%xmm2,%xmm0
+	pclmulqdq	$0x11,%xmm2,%xmm1
+	pclmulqdq	$0x00,%xmm7,%xmm3
 	pxor	%xmm0,%xmm3
 	pxor	%xmm1,%xmm3
 
@@ -600,7 +600,7 @@
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
 .Ldone:
-.byte	102,65,15,56,0,194
+	pshufb	%xmm10,%xmm0
 	movdqu	%xmm0,(%rdi)
 	ret
 .cfi_endproc	
diff --git a/gen/bcm/ghash-x86_64-win.asm b/gen/bcm/ghash-x86_64-win.asm
index b5416b3..5cfb844 100644
--- a/gen/bcm/ghash-x86_64-win.asm
+++ b/gen/bcm/ghash-x86_64-win.asm
@@ -50,9 +50,9 @@
 	movdqa	xmm1,xmm0
 	pshufd	xmm3,xmm0,78
 	pxor	xmm3,xmm0
-DB	102,15,58,68,194,0
-DB	102,15,58,68,202,17
-DB	102,15,58,68,222,0
+	pclmulqdq	xmm0,xmm2,0x00
+	pclmulqdq	xmm1,xmm2,0x11
+	pclmulqdq	xmm3,xmm6,0x00
 	pxor	xmm3,xmm0
 	pxor	xmm3,xmm1
 
@@ -90,14 +90,14 @@
 	movdqu	XMMWORD[rcx],xmm2
 	pxor	xmm4,xmm0
 	movdqu	XMMWORD[16+rcx],xmm0
-DB	102,15,58,15,227,8
+	palignr	xmm4,xmm3,8
 	movdqu	XMMWORD[32+rcx],xmm4
 	movdqa	xmm1,xmm0
 	pshufd	xmm3,xmm0,78
 	pxor	xmm3,xmm0
-DB	102,15,58,68,194,0
-DB	102,15,58,68,202,17
-DB	102,15,58,68,222,0
+	pclmulqdq	xmm0,xmm2,0x00
+	pclmulqdq	xmm1,xmm2,0x11
+	pclmulqdq	xmm3,xmm6,0x00
 	pxor	xmm3,xmm0
 	pxor	xmm3,xmm1
 
@@ -133,9 +133,9 @@
 	movdqa	xmm1,xmm0
 	pshufd	xmm3,xmm0,78
 	pxor	xmm3,xmm0
-DB	102,15,58,68,194,0
-DB	102,15,58,68,202,17
-DB	102,15,58,68,222,0
+	pclmulqdq	xmm0,xmm2,0x00
+	pclmulqdq	xmm1,xmm2,0x11
+	pclmulqdq	xmm3,xmm6,0x00
 	pxor	xmm3,xmm0
 	pxor	xmm3,xmm1
 
@@ -173,7 +173,7 @@
 	movdqu	XMMWORD[48+rcx],xmm5
 	pxor	xmm4,xmm0
 	movdqu	XMMWORD[64+rcx],xmm0
-DB	102,15,58,15,227,8
+	palignr	xmm4,xmm3,8
 	movdqu	XMMWORD[80+rcx],xmm4
 	movaps	xmm6,XMMWORD[rsp]
 	lea	rsp,[24+rsp]
@@ -192,13 +192,13 @@
 	movdqa	xmm5,XMMWORD[$L$bswap_mask]
 	movdqu	xmm2,XMMWORD[rdx]
 	movdqu	xmm4,XMMWORD[32+rdx]
-DB	102,15,56,0,197
+	pshufb	xmm0,xmm5
 	movdqa	xmm1,xmm0
 	pshufd	xmm3,xmm0,78
 	pxor	xmm3,xmm0
-DB	102,15,58,68,194,0
-DB	102,15,58,68,202,17
-DB	102,15,58,68,220,0
+	pclmulqdq	xmm0,xmm2,0x00
+	pclmulqdq	xmm1,xmm2,0x11
+	pclmulqdq	xmm3,xmm4,0x00
 	pxor	xmm3,xmm0
 	pxor	xmm3,xmm1
 
@@ -230,7 +230,7 @@
 	pxor	xmm0,xmm4
 	psrlq	xmm0,1
 	pxor	xmm0,xmm1
-DB	102,15,56,0,197
+	pshufb	xmm0,xmm5
 	movdqu	XMMWORD[rcx],xmm0
 	ret
 
@@ -272,7 +272,7 @@
 	movdqu	xmm0,XMMWORD[rcx]
 	movdqu	xmm2,XMMWORD[rdx]
 	movdqu	xmm7,XMMWORD[32+rdx]
-DB	102,65,15,56,0,194
+	pshufb	xmm0,xmm10
 
 	sub	r9,0x10
 	jz	NEAR $L$odd_tail
@@ -291,21 +291,21 @@
 
 	movdqu	xmm3,XMMWORD[48+r8]
 	movdqu	xmm11,XMMWORD[32+r8]
-DB	102,65,15,56,0,218
-DB	102,69,15,56,0,218
+	pshufb	xmm3,xmm10
+	pshufb	xmm11,xmm10
 	movdqa	xmm5,xmm3
 	pshufd	xmm4,xmm3,78
 	pxor	xmm4,xmm3
-DB	102,15,58,68,218,0
-DB	102,15,58,68,234,17
-DB	102,15,58,68,231,0
+	pclmulqdq	xmm3,xmm2,0x00
+	pclmulqdq	xmm5,xmm2,0x11
+	pclmulqdq	xmm4,xmm7,0x00
 
 	movdqa	xmm13,xmm11
 	pshufd	xmm12,xmm11,78
 	pxor	xmm12,xmm11
-DB	102,68,15,58,68,222,0
-DB	102,68,15,58,68,238,17
-DB	102,68,15,58,68,231,16
+	pclmulqdq	xmm11,xmm6,0x00
+	pclmulqdq	xmm13,xmm6,0x11
+	pclmulqdq	xmm12,xmm7,0x10
 	xorps	xmm3,xmm11
 	xorps	xmm5,xmm13
 	movups	xmm7,XMMWORD[80+rdx]
@@ -313,18 +313,18 @@
 
 	movdqu	xmm11,XMMWORD[16+r8]
 	movdqu	xmm8,XMMWORD[r8]
-DB	102,69,15,56,0,218
-DB	102,69,15,56,0,194
+	pshufb	xmm11,xmm10
+	pshufb	xmm8,xmm10
 	movdqa	xmm13,xmm11
 	pshufd	xmm12,xmm11,78
 	pxor	xmm0,xmm8
 	pxor	xmm12,xmm11
-DB	102,69,15,58,68,222,0
+	pclmulqdq	xmm11,xmm14,0x00
 	movdqa	xmm1,xmm0
 	pshufd	xmm8,xmm0,78
 	pxor	xmm8,xmm0
-DB	102,69,15,58,68,238,17
-DB	102,68,15,58,68,231,0
+	pclmulqdq	xmm13,xmm14,0x11
+	pclmulqdq	xmm12,xmm7,0x00
 	xorps	xmm3,xmm11
 	xorps	xmm5,xmm13
 
@@ -335,22 +335,22 @@
 	jmp	NEAR $L$mod4_loop
 ALIGN	32
 $L$mod4_loop:
-DB	102,65,15,58,68,199,0
+	pclmulqdq	xmm0,xmm15,0x00
 	xorps	xmm4,xmm12
 	movdqu	xmm11,XMMWORD[48+r8]
-DB	102,69,15,56,0,218
-DB	102,65,15,58,68,207,17
+	pshufb	xmm11,xmm10
+	pclmulqdq	xmm1,xmm15,0x11
 	xorps	xmm0,xmm3
 	movdqu	xmm3,XMMWORD[32+r8]
 	movdqa	xmm13,xmm11
-DB	102,68,15,58,68,199,16
+	pclmulqdq	xmm8,xmm7,0x10
 	pshufd	xmm12,xmm11,78
 	xorps	xmm1,xmm5
 	pxor	xmm12,xmm11
-DB	102,65,15,56,0,218
+	pshufb	xmm3,xmm10
 	movups	xmm7,XMMWORD[32+rdx]
 	xorps	xmm8,xmm4
-DB	102,68,15,58,68,218,0
+	pclmulqdq	xmm11,xmm2,0x00
 	pshufd	xmm4,xmm3,78
 
 	pxor	xmm8,xmm0
@@ -358,22 +358,22 @@
 	pxor	xmm8,xmm1
 	pxor	xmm4,xmm3
 	movdqa	xmm9,xmm8
-DB	102,68,15,58,68,234,17
+	pclmulqdq	xmm13,xmm2,0x11
 	pslldq	xmm8,8
 	psrldq	xmm9,8
 	pxor	xmm0,xmm8
 	movdqa	xmm8,XMMWORD[$L$7_mask]
 	pxor	xmm1,xmm9
-DB	102,76,15,110,200
+	movq	xmm9,rax
 
 	pand	xmm8,xmm0
-DB	102,69,15,56,0,200
+	pshufb	xmm9,xmm8
 	pxor	xmm9,xmm0
-DB	102,68,15,58,68,231,0
+	pclmulqdq	xmm12,xmm7,0x00
 	psllq	xmm9,57
 	movdqa	xmm8,xmm9
 	pslldq	xmm9,8
-DB	102,15,58,68,222,0
+	pclmulqdq	xmm3,xmm6,0x00
 	psrldq	xmm8,8
 	pxor	xmm0,xmm9
 	pxor	xmm1,xmm8
@@ -381,14 +381,14 @@
 
 	movdqa	xmm9,xmm0
 	psrlq	xmm0,1
-DB	102,15,58,68,238,17
+	pclmulqdq	xmm5,xmm6,0x11
 	xorps	xmm3,xmm11
 	movdqu	xmm11,XMMWORD[16+r8]
-DB	102,69,15,56,0,218
-DB	102,15,58,68,231,16
+	pshufb	xmm11,xmm10
+	pclmulqdq	xmm4,xmm7,0x10
 	xorps	xmm5,xmm13
 	movups	xmm7,XMMWORD[80+rdx]
-DB	102,69,15,56,0,194
+	pshufb	xmm8,xmm10
 	pxor	xmm1,xmm9
 	pxor	xmm9,xmm0
 	psrlq	xmm0,5
@@ -399,16 +399,16 @@
 	pxor	xmm0,xmm9
 	pxor	xmm1,xmm8
 	pxor	xmm12,xmm11
-DB	102,69,15,58,68,222,0
+	pclmulqdq	xmm11,xmm14,0x00
 	psrlq	xmm0,1
 	pxor	xmm0,xmm1
 	movdqa	xmm1,xmm0
-DB	102,69,15,58,68,238,17
+	pclmulqdq	xmm13,xmm14,0x11
 	xorps	xmm3,xmm11
 	pshufd	xmm8,xmm0,78
 	pxor	xmm8,xmm0
 
-DB	102,68,15,58,68,231,0
+	pclmulqdq	xmm12,xmm7,0x00
 	xorps	xmm5,xmm13
 
 	lea	r8,[64+r8]
@@ -416,9 +416,9 @@
 	jnc	NEAR $L$mod4_loop
 
 $L$tail4x:
-DB	102,65,15,58,68,199,0
-DB	102,65,15,58,68,207,17
-DB	102,68,15,58,68,199,16
+	pclmulqdq	xmm0,xmm15,0x00
+	pclmulqdq	xmm1,xmm15,0x11
+	pclmulqdq	xmm8,xmm7,0x10
 	xorps	xmm4,xmm12
 	xorps	xmm0,xmm3
 	xorps	xmm1,xmm5
@@ -469,16 +469,16 @@
 
 	movdqu	xmm8,XMMWORD[r8]
 	movdqu	xmm3,XMMWORD[16+r8]
-DB	102,69,15,56,0,194
-DB	102,65,15,56,0,218
+	pshufb	xmm8,xmm10
+	pshufb	xmm3,xmm10
 	pxor	xmm0,xmm8
 
 	movdqa	xmm5,xmm3
 	pshufd	xmm4,xmm3,78
 	pxor	xmm4,xmm3
-DB	102,15,58,68,218,0
-DB	102,15,58,68,234,17
-DB	102,15,58,68,231,0
+	pclmulqdq	xmm3,xmm2,0x00
+	pclmulqdq	xmm5,xmm2,0x11
+	pclmulqdq	xmm4,xmm7,0x00
 
 	lea	r8,[32+r8]
 	nop
@@ -494,21 +494,21 @@
 	pshufd	xmm4,xmm0,78
 	pxor	xmm4,xmm0
 
-DB	102,15,58,68,198,0
-DB	102,15,58,68,206,17
-DB	102,15,58,68,231,16
+	pclmulqdq	xmm0,xmm6,0x00
+	pclmulqdq	xmm1,xmm6,0x11
+	pclmulqdq	xmm4,xmm7,0x10
 
 	pxor	xmm0,xmm3
 	pxor	xmm1,xmm5
 	movdqu	xmm9,XMMWORD[r8]
 	pxor	xmm8,xmm0
-DB	102,69,15,56,0,202
+	pshufb	xmm9,xmm10
 	movdqu	xmm3,XMMWORD[16+r8]
 
 	pxor	xmm8,xmm1
 	pxor	xmm1,xmm9
 	pxor	xmm4,xmm8
-DB	102,65,15,56,0,218
+	pshufb	xmm3,xmm10
 	movdqa	xmm8,xmm4
 	psrldq	xmm8,8
 	pslldq	xmm4,8
@@ -521,7 +521,7 @@
 	movdqa	xmm8,xmm0
 	psllq	xmm0,5
 	pxor	xmm8,xmm0
-DB	102,15,58,68,218,0
+	pclmulqdq	xmm3,xmm2,0x00
 	psllq	xmm0,1
 	pxor	xmm0,xmm8
 	psllq	xmm0,57
@@ -535,14 +535,14 @@
 
 	movdqa	xmm9,xmm0
 	psrlq	xmm0,1
-DB	102,15,58,68,234,17
+	pclmulqdq	xmm5,xmm2,0x11
 	pxor	xmm1,xmm9
 	pxor	xmm9,xmm0
 	psrlq	xmm0,5
 	pxor	xmm0,xmm9
 	lea	r8,[32+r8]
 	psrlq	xmm0,1
-DB	102,15,58,68,231,0
+	pclmulqdq	xmm4,xmm7,0x00
 	pxor	xmm0,xmm1
 
 	sub	r9,0x20
@@ -554,9 +554,9 @@
 	pshufd	xmm4,xmm0,78
 	pxor	xmm4,xmm0
 
-DB	102,15,58,68,198,0
-DB	102,15,58,68,206,17
-DB	102,15,58,68,231,16
+	pclmulqdq	xmm0,xmm6,0x00
+	pclmulqdq	xmm1,xmm6,0x11
+	pclmulqdq	xmm4,xmm7,0x10
 
 	pxor	xmm0,xmm3
 	pxor	xmm1,xmm5
@@ -596,14 +596,14 @@
 
 $L$odd_tail:
 	movdqu	xmm8,XMMWORD[r8]
-DB	102,69,15,56,0,194
+	pshufb	xmm8,xmm10
 	pxor	xmm0,xmm8
 	movdqa	xmm1,xmm0
 	pshufd	xmm3,xmm0,78
 	pxor	xmm3,xmm0
-DB	102,15,58,68,194,0
-DB	102,15,58,68,202,17
-DB	102,15,58,68,223,0
+	pclmulqdq	xmm0,xmm2,0x00
+	pclmulqdq	xmm1,xmm2,0x11
+	pclmulqdq	xmm3,xmm7,0x00
 	pxor	xmm3,xmm0
 	pxor	xmm3,xmm1
 
@@ -636,7 +636,7 @@
 	psrlq	xmm0,1
 	pxor	xmm0,xmm1
 $L$done:
-DB	102,65,15,56,0,194
+	pshufb	xmm0,xmm10
 	movdqu	XMMWORD[rcx],xmm0
 	movaps	xmm6,XMMWORD[rsp]
 	movaps	xmm7,XMMWORD[16+rsp]
diff --git a/gen/bcm/p256-x86_64-asm-apple.S b/gen/bcm/p256-x86_64-asm-apple.S
index d43fcfc..80ffa01 100644
--- a/gen/bcm/p256-x86_64-asm-apple.S
+++ b/gen/bcm/p256-x86_64-asm-apple.S
@@ -450,21 +450,21 @@
 	movq	%rax,%rbp
 	mulq	%r8
 	movq	%rax,%r9
-.byte	102,72,15,110,205
+	movq	%rbp,%xmm1
 	movq	%r14,%rax
 	movq	%rdx,%r10
 
 	mulq	%r8
 	addq	%rax,%r10
 	movq	%r15,%rax
-.byte	102,73,15,110,214
+	movq	%r14,%xmm2
 	adcq	$0,%rdx
 	movq	%rdx,%r11
 
 	mulq	%r8
 	addq	%rax,%r11
 	movq	%r15,%rax
-.byte	102,73,15,110,223
+	movq	%r15,%xmm3
 	adcq	$0,%rdx
 	movq	%rdx,%r12
 
@@ -503,20 +503,20 @@
 
 	mulq	%rax
 	movq	%rax,%r8
-.byte	102,72,15,126,200
+	movq	%xmm1,%rax
 	movq	%rdx,%rbp
 
 	mulq	%rax
 	addq	%rbp,%r9
 	adcq	%rax,%r10
-.byte	102,72,15,126,208
+	movq	%xmm2,%rax
 	adcq	$0,%rdx
 	movq	%rdx,%rbp
 
 	mulq	%rax
 	addq	%rbp,%r11
 	adcq	%rax,%r12
-.byte	102,72,15,126,216
+	movq	%xmm3,%rax
 	adcq	$0,%rdx
 	movq	%rdx,%rbp
 
@@ -977,11 +977,11 @@
 	mulxq	%r14,%r9,%r10
 	mulxq	%r15,%rcx,%r11
 	movq	%rdx,%rax
-.byte	102,73,15,110,206
+	movq	%r14,%xmm1
 	mulxq	%r8,%rbp,%r12
 	movq	%r14,%rdx
 	addq	%rcx,%r10
-.byte	102,73,15,110,215
+	movq	%r15,%xmm2
 	adcq	%rbp,%r11
 	adcq	$0,%r12
 	xorq	%r13,%r13
@@ -998,7 +998,7 @@
 
 	mulxq	%r8,%rcx,%r14
 	movq	%rax,%rdx
-.byte	102,73,15,110,216
+	movq	%r8,%xmm3
 	xorq	%r15,%r15
 	adcxq	%r9,%r9
 	adoxq	%rcx,%r13
@@ -1007,18 +1007,18 @@
 
 
 	mulxq	%rdx,%r8,%rbp
-.byte	102,72,15,126,202
+	movq	%xmm1,%rdx
 	adcxq	%r11,%r11
 	adoxq	%rbp,%r9
 	adcxq	%r12,%r12
 	mulxq	%rdx,%rcx,%rax
-.byte	102,72,15,126,210
+	movq	%xmm2,%rdx
 	adcxq	%r13,%r13
 	adoxq	%rcx,%r10
 	adcxq	%r14,%r14
 	mulxq	%rdx,%rcx,%rbp
 .byte	0x67
-.byte	102,72,15,126,218
+	movq	%xmm3,%rdx
 	adoxq	%rax,%r11
 	adcxq	%r15,%r15
 	adoxq	%rcx,%r12
@@ -2462,9 +2462,9 @@
 	movdqa	%xmm1,96+16(%rsp)
 	leaq	32(%rdi),%r10
 	leaq	64(%rdi),%r11
-.byte	102,72,15,110,199
-.byte	102,73,15,110,202
-.byte	102,73,15,110,211
+	movq	%rdi,%xmm0
+	movq	%r10,%xmm1
+	movq	%r11,%xmm2
 
 	leaq	0(%rsp),%rdi
 	call	__ecp_nistz256_mul_by_2q
@@ -2492,7 +2492,7 @@
 	movq	64+24(%rbx),%r12
 	leaq	64-0(%rbx),%rsi
 	leaq	32(%rbx),%rbx
-.byte	102,72,15,126,215
+	movq	%xmm2,%rdi
 	call	__ecp_nistz256_mul_montq
 	call	__ecp_nistz256_mul_by_2q
 
@@ -2517,7 +2517,7 @@
 	leaq	0+0(%rsp),%rsi
 	movq	16+0(%rsp),%r15
 	movq	24+0(%rsp),%r8
-.byte	102,72,15,126,207
+	movq	%xmm1,%rdi
 	call	__ecp_nistz256_sqr_montq
 	xorq	%r9,%r9
 	movq	%r12,%rax
@@ -2592,7 +2592,7 @@
 	leaq	0+32(%rsp),%rsi
 	movq	16+32(%rsp),%r15
 	movq	24+32(%rsp),%r8
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 	call	__ecp_nistz256_sqr_montq
 
 	leaq	128(%rsp),%rbx
@@ -2625,8 +2625,8 @@
 	leaq	0(%rsp),%rdi
 	call	__ecp_nistz256_mul_montq
 
-.byte	102,72,15,126,203
-.byte	102,72,15,126,207
+	movq	%xmm1,%rbx
+	movq	%xmm1,%rdi
 	call	__ecp_nistz256_sub_fromq
 
 	leaq	160+56(%rsp),%rsi
@@ -2708,7 +2708,7 @@
 	por	%xmm4,%xmm5
 	pxor	%xmm4,%xmm4
 	por	%xmm0,%xmm1
-.byte	102,72,15,110,199
+	movq	%rdi,%xmm0
 
 	leaq	64-0(%rsi),%rsi
 	movq	%rax,544+0(%rsp)
@@ -2731,7 +2731,7 @@
 	movq	64+8(%rbx),%r14
 	movq	64+16(%rbx),%r15
 	movq	64+24(%rbx),%r8
-.byte	102,72,15,110,203
+	movq	%rbx,%xmm1
 
 	leaq	64-0(%rbx),%rsi
 	leaq	32(%rsp),%rdi
@@ -2786,7 +2786,7 @@
 	orq	%r8,%r12
 	orq	%r9,%r12
 	por	%xmm5,%xmm2
-.byte	102,73,15,110,220
+	movq	%r12,%xmm3
 
 	movq	384(%rsp),%rax
 	leaq	384(%rsp),%rbx
@@ -2816,8 +2816,8 @@
 	orq	%r8,%r12
 	orq	%r9,%r12
 
-.byte	102,73,15,126,208
-.byte	102,73,15,126,217
+	movq	%xmm2,%r8
+	movq	%xmm3,%r9
 	orq	%r8,%r12
 .byte	0x3e
 	jnz	L$add_proceedq
@@ -2832,7 +2832,7 @@
 
 
 
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 	pxor	%xmm0,%xmm0
 	movdqu	%xmm0,0(%rdi)
 	movdqu	%xmm0,16(%rdi)
@@ -2844,8 +2844,8 @@
 
 .p2align	5
 L$add_doubleq:
-.byte	102,72,15,126,206
-.byte	102,72,15,126,199
+	movq	%xmm1,%rsi
+	movq	%xmm0,%rdi
 	addq	$416,%rsp
 
 	jmp	L$point_double_shortcutq
@@ -2981,7 +2981,7 @@
 	leaq	320(%rsp),%rdi
 	call	__ecp_nistz256_sub_fromq
 
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 
 	movdqa	%xmm5,%xmm0
 	movdqa	%xmm5,%xmm1
@@ -3128,7 +3128,7 @@
 	pshufd	$0x1e,%xmm5,%xmm4
 	movdqa	%xmm1,416+16(%rsp)
 	por	%xmm0,%xmm1
-.byte	102,72,15,110,199
+	movq	%rdi,%xmm0
 	movdqa	%xmm2,448(%rsp)
 	movdqa	%xmm3,448+16(%rsp)
 	por	%xmm2,%xmm3
@@ -3306,7 +3306,7 @@
 	leaq	256(%rsp),%rdi
 	call	__ecp_nistz256_sub_fromq
 
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 
 	movdqa	%xmm5,%xmm0
 	movdqa	%xmm5,%xmm1
@@ -3572,9 +3572,9 @@
 	movdqa	%xmm1,96+16(%rsp)
 	leaq	32(%rdi),%r10
 	leaq	64(%rdi),%r11
-.byte	102,72,15,110,199
-.byte	102,73,15,110,202
-.byte	102,73,15,110,211
+	movq	%rdi,%xmm0
+	movq	%r10,%xmm1
+	movq	%r11,%xmm2
 
 	leaq	0(%rsp),%rdi
 	call	__ecp_nistz256_mul_by_2x
@@ -3602,7 +3602,7 @@
 	movq	64+24(%rbx),%r12
 	leaq	64-128(%rbx),%rsi
 	leaq	32(%rbx),%rbx
-.byte	102,72,15,126,215
+	movq	%xmm2,%rdi
 	call	__ecp_nistz256_mul_montx
 	call	__ecp_nistz256_mul_by_2x
 
@@ -3627,7 +3627,7 @@
 	leaq	-128+0(%rsp),%rsi
 	movq	16+0(%rsp),%r15
 	movq	24+0(%rsp),%r8
-.byte	102,72,15,126,207
+	movq	%xmm1,%rdi
 	call	__ecp_nistz256_sqr_montx
 	xorq	%r9,%r9
 	movq	%r12,%rax
@@ -3702,7 +3702,7 @@
 	leaq	-128+32(%rsp),%rsi
 	movq	16+32(%rsp),%r15
 	movq	24+32(%rsp),%r8
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 	call	__ecp_nistz256_sqr_montx
 
 	leaq	128(%rsp),%rbx
@@ -3735,8 +3735,8 @@
 	leaq	0(%rsp),%rdi
 	call	__ecp_nistz256_mul_montx
 
-.byte	102,72,15,126,203
-.byte	102,72,15,126,207
+	movq	%xmm1,%rbx
+	movq	%xmm1,%rdi
 	call	__ecp_nistz256_sub_fromx
 
 	leaq	160+56(%rsp),%rsi
@@ -3818,7 +3818,7 @@
 	por	%xmm4,%xmm5
 	pxor	%xmm4,%xmm4
 	por	%xmm0,%xmm1
-.byte	102,72,15,110,199
+	movq	%rdi,%xmm0
 
 	leaq	64-128(%rsi),%rsi
 	movq	%rdx,544+0(%rsp)
@@ -3841,7 +3841,7 @@
 	movq	64+8(%rbx),%r14
 	movq	64+16(%rbx),%r15
 	movq	64+24(%rbx),%r8
-.byte	102,72,15,110,203
+	movq	%rbx,%xmm1
 
 	leaq	64-128(%rbx),%rsi
 	leaq	32(%rsp),%rdi
@@ -3896,7 +3896,7 @@
 	orq	%r8,%r12
 	orq	%r9,%r12
 	por	%xmm5,%xmm2
-.byte	102,73,15,110,220
+	movq	%r12,%xmm3
 
 	movq	384(%rsp),%rdx
 	leaq	384(%rsp),%rbx
@@ -3926,8 +3926,8 @@
 	orq	%r8,%r12
 	orq	%r9,%r12
 
-.byte	102,73,15,126,208
-.byte	102,73,15,126,217
+	movq	%xmm2,%r8
+	movq	%xmm3,%r9
 	orq	%r8,%r12
 .byte	0x3e
 	jnz	L$add_proceedx
@@ -3942,7 +3942,7 @@
 
 
 
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 	pxor	%xmm0,%xmm0
 	movdqu	%xmm0,0(%rdi)
 	movdqu	%xmm0,16(%rdi)
@@ -3954,8 +3954,8 @@
 
 .p2align	5
 L$add_doublex:
-.byte	102,72,15,126,206
-.byte	102,72,15,126,199
+	movq	%xmm1,%rsi
+	movq	%xmm0,%rdi
 	addq	$416,%rsp
 
 	jmp	L$point_double_shortcutx
@@ -4091,7 +4091,7 @@
 	leaq	320(%rsp),%rdi
 	call	__ecp_nistz256_sub_fromx
 
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 
 	movdqa	%xmm5,%xmm0
 	movdqa	%xmm5,%xmm1
@@ -4238,7 +4238,7 @@
 	pshufd	$0x1e,%xmm5,%xmm4
 	movdqa	%xmm1,416+16(%rsp)
 	por	%xmm0,%xmm1
-.byte	102,72,15,110,199
+	movq	%rdi,%xmm0
 	movdqa	%xmm2,448(%rsp)
 	movdqa	%xmm3,448+16(%rsp)
 	por	%xmm2,%xmm3
@@ -4416,7 +4416,7 @@
 	leaq	256(%rsp),%rdi
 	call	__ecp_nistz256_sub_fromx
 
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 
 	movdqa	%xmm5,%xmm0
 	movdqa	%xmm5,%xmm1
diff --git a/gen/bcm/p256-x86_64-asm-linux.S b/gen/bcm/p256-x86_64-asm-linux.S
index b14ef80..c54d577 100644
--- a/gen/bcm/p256-x86_64-asm-linux.S
+++ b/gen/bcm/p256-x86_64-asm-linux.S
@@ -464,21 +464,21 @@
 	movq	%rax,%rbp
 	mulq	%r8
 	movq	%rax,%r9
-.byte	102,72,15,110,205
+	movq	%rbp,%xmm1
 	movq	%r14,%rax
 	movq	%rdx,%r10
 
 	mulq	%r8
 	addq	%rax,%r10
 	movq	%r15,%rax
-.byte	102,73,15,110,214
+	movq	%r14,%xmm2
 	adcq	$0,%rdx
 	movq	%rdx,%r11
 
 	mulq	%r8
 	addq	%rax,%r11
 	movq	%r15,%rax
-.byte	102,73,15,110,223
+	movq	%r15,%xmm3
 	adcq	$0,%rdx
 	movq	%rdx,%r12
 
@@ -517,20 +517,20 @@
 
 	mulq	%rax
 	movq	%rax,%r8
-.byte	102,72,15,126,200
+	movq	%xmm1,%rax
 	movq	%rdx,%rbp
 
 	mulq	%rax
 	addq	%rbp,%r9
 	adcq	%rax,%r10
-.byte	102,72,15,126,208
+	movq	%xmm2,%rax
 	adcq	$0,%rdx
 	movq	%rdx,%rbp
 
 	mulq	%rax
 	addq	%rbp,%r11
 	adcq	%rax,%r12
-.byte	102,72,15,126,216
+	movq	%xmm3,%rax
 	adcq	$0,%rdx
 	movq	%rdx,%rbp
 
@@ -1003,11 +1003,11 @@
 	mulxq	%r14,%r9,%r10
 	mulxq	%r15,%rcx,%r11
 	movq	%rdx,%rax
-.byte	102,73,15,110,206
+	movq	%r14,%xmm1
 	mulxq	%r8,%rbp,%r12
 	movq	%r14,%rdx
 	addq	%rcx,%r10
-.byte	102,73,15,110,215
+	movq	%r15,%xmm2
 	adcq	%rbp,%r11
 	adcq	$0,%r12
 	xorq	%r13,%r13
@@ -1024,7 +1024,7 @@
 
 	mulxq	%r8,%rcx,%r14
 	movq	%rax,%rdx
-.byte	102,73,15,110,216
+	movq	%r8,%xmm3
 	xorq	%r15,%r15
 	adcxq	%r9,%r9
 	adoxq	%rcx,%r13
@@ -1033,18 +1033,18 @@
 
 
 	mulxq	%rdx,%r8,%rbp
-.byte	102,72,15,126,202
+	movq	%xmm1,%rdx
 	adcxq	%r11,%r11
 	adoxq	%rbp,%r9
 	adcxq	%r12,%r12
 	mulxq	%rdx,%rcx,%rax
-.byte	102,72,15,126,210
+	movq	%xmm2,%rdx
 	adcxq	%r13,%r13
 	adoxq	%rcx,%r10
 	adcxq	%r14,%r14
 	mulxq	%rdx,%rcx,%rbp
 .byte	0x67
-.byte	102,72,15,126,218
+	movq	%xmm3,%rdx
 	adoxq	%rax,%r11
 	adcxq	%r15,%r15
 	adoxq	%rcx,%r12
@@ -2518,9 +2518,9 @@
 	movdqa	%xmm1,96+16(%rsp)
 	leaq	32(%rdi),%r10
 	leaq	64(%rdi),%r11
-.byte	102,72,15,110,199
-.byte	102,73,15,110,202
-.byte	102,73,15,110,211
+	movq	%rdi,%xmm0
+	movq	%r10,%xmm1
+	movq	%r11,%xmm2
 
 	leaq	0(%rsp),%rdi
 	call	__ecp_nistz256_mul_by_2q
@@ -2548,7 +2548,7 @@
 	movq	64+24(%rbx),%r12
 	leaq	64-0(%rbx),%rsi
 	leaq	32(%rbx),%rbx
-.byte	102,72,15,126,215
+	movq	%xmm2,%rdi
 	call	__ecp_nistz256_mul_montq
 	call	__ecp_nistz256_mul_by_2q
 
@@ -2573,7 +2573,7 @@
 	leaq	0+0(%rsp),%rsi
 	movq	16+0(%rsp),%r15
 	movq	24+0(%rsp),%r8
-.byte	102,72,15,126,207
+	movq	%xmm1,%rdi
 	call	__ecp_nistz256_sqr_montq
 	xorq	%r9,%r9
 	movq	%r12,%rax
@@ -2648,7 +2648,7 @@
 	leaq	0+32(%rsp),%rsi
 	movq	16+32(%rsp),%r15
 	movq	24+32(%rsp),%r8
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 	call	__ecp_nistz256_sqr_montq
 
 	leaq	128(%rsp),%rbx
@@ -2681,8 +2681,8 @@
 	leaq	0(%rsp),%rdi
 	call	__ecp_nistz256_mul_montq
 
-.byte	102,72,15,126,203
-.byte	102,72,15,126,207
+	movq	%xmm1,%rbx
+	movq	%xmm1,%rdi
 	call	__ecp_nistz256_sub_fromq
 
 	leaq	160+56(%rsp),%rsi
@@ -2770,7 +2770,7 @@
 	por	%xmm4,%xmm5
 	pxor	%xmm4,%xmm4
 	por	%xmm0,%xmm1
-.byte	102,72,15,110,199
+	movq	%rdi,%xmm0
 
 	leaq	64-0(%rsi),%rsi
 	movq	%rax,544+0(%rsp)
@@ -2793,7 +2793,7 @@
 	movq	64+8(%rbx),%r14
 	movq	64+16(%rbx),%r15
 	movq	64+24(%rbx),%r8
-.byte	102,72,15,110,203
+	movq	%rbx,%xmm1
 
 	leaq	64-0(%rbx),%rsi
 	leaq	32(%rsp),%rdi
@@ -2848,7 +2848,7 @@
 	orq	%r8,%r12
 	orq	%r9,%r12
 	por	%xmm5,%xmm2
-.byte	102,73,15,110,220
+	movq	%r12,%xmm3
 
 	movq	384(%rsp),%rax
 	leaq	384(%rsp),%rbx
@@ -2878,8 +2878,8 @@
 	orq	%r8,%r12
 	orq	%r9,%r12
 
-.byte	102,73,15,126,208
-.byte	102,73,15,126,217
+	movq	%xmm2,%r8
+	movq	%xmm3,%r9
 	orq	%r8,%r12
 .byte	0x3e
 	jnz	.Ladd_proceedq
@@ -2894,7 +2894,7 @@
 
 
 
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 	pxor	%xmm0,%xmm0
 	movdqu	%xmm0,0(%rdi)
 	movdqu	%xmm0,16(%rdi)
@@ -2906,8 +2906,8 @@
 
 .align	32
 .Ladd_doubleq:
-.byte	102,72,15,126,206
-.byte	102,72,15,126,199
+	movq	%xmm1,%rsi
+	movq	%xmm0,%rdi
 	addq	$416,%rsp
 .cfi_adjust_cfa_offset	-416
 	jmp	.Lpoint_double_shortcutq
@@ -3043,7 +3043,7 @@
 	leaq	320(%rsp),%rdi
 	call	__ecp_nistz256_sub_fromq
 
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 
 	movdqa	%xmm5,%xmm0
 	movdqa	%xmm5,%xmm1
@@ -3196,7 +3196,7 @@
 	pshufd	$0x1e,%xmm5,%xmm4
 	movdqa	%xmm1,416+16(%rsp)
 	por	%xmm0,%xmm1
-.byte	102,72,15,110,199
+	movq	%rdi,%xmm0
 	movdqa	%xmm2,448(%rsp)
 	movdqa	%xmm3,448+16(%rsp)
 	por	%xmm2,%xmm3
@@ -3374,7 +3374,7 @@
 	leaq	256(%rsp),%rdi
 	call	__ecp_nistz256_sub_fromq
 
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 
 	movdqa	%xmm5,%xmm0
 	movdqa	%xmm5,%xmm1
@@ -3646,9 +3646,9 @@
 	movdqa	%xmm1,96+16(%rsp)
 	leaq	32(%rdi),%r10
 	leaq	64(%rdi),%r11
-.byte	102,72,15,110,199
-.byte	102,73,15,110,202
-.byte	102,73,15,110,211
+	movq	%rdi,%xmm0
+	movq	%r10,%xmm1
+	movq	%r11,%xmm2
 
 	leaq	0(%rsp),%rdi
 	call	__ecp_nistz256_mul_by_2x
@@ -3676,7 +3676,7 @@
 	movq	64+24(%rbx),%r12
 	leaq	64-128(%rbx),%rsi
 	leaq	32(%rbx),%rbx
-.byte	102,72,15,126,215
+	movq	%xmm2,%rdi
 	call	__ecp_nistz256_mul_montx
 	call	__ecp_nistz256_mul_by_2x
 
@@ -3701,7 +3701,7 @@
 	leaq	-128+0(%rsp),%rsi
 	movq	16+0(%rsp),%r15
 	movq	24+0(%rsp),%r8
-.byte	102,72,15,126,207
+	movq	%xmm1,%rdi
 	call	__ecp_nistz256_sqr_montx
 	xorq	%r9,%r9
 	movq	%r12,%rax
@@ -3776,7 +3776,7 @@
 	leaq	-128+32(%rsp),%rsi
 	movq	16+32(%rsp),%r15
 	movq	24+32(%rsp),%r8
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 	call	__ecp_nistz256_sqr_montx
 
 	leaq	128(%rsp),%rbx
@@ -3809,8 +3809,8 @@
 	leaq	0(%rsp),%rdi
 	call	__ecp_nistz256_mul_montx
 
-.byte	102,72,15,126,203
-.byte	102,72,15,126,207
+	movq	%xmm1,%rbx
+	movq	%xmm1,%rdi
 	call	__ecp_nistz256_sub_fromx
 
 	leaq	160+56(%rsp),%rsi
@@ -3898,7 +3898,7 @@
 	por	%xmm4,%xmm5
 	pxor	%xmm4,%xmm4
 	por	%xmm0,%xmm1
-.byte	102,72,15,110,199
+	movq	%rdi,%xmm0
 
 	leaq	64-128(%rsi),%rsi
 	movq	%rdx,544+0(%rsp)
@@ -3921,7 +3921,7 @@
 	movq	64+8(%rbx),%r14
 	movq	64+16(%rbx),%r15
 	movq	64+24(%rbx),%r8
-.byte	102,72,15,110,203
+	movq	%rbx,%xmm1
 
 	leaq	64-128(%rbx),%rsi
 	leaq	32(%rsp),%rdi
@@ -3976,7 +3976,7 @@
 	orq	%r8,%r12
 	orq	%r9,%r12
 	por	%xmm5,%xmm2
-.byte	102,73,15,110,220
+	movq	%r12,%xmm3
 
 	movq	384(%rsp),%rdx
 	leaq	384(%rsp),%rbx
@@ -4006,8 +4006,8 @@
 	orq	%r8,%r12
 	orq	%r9,%r12
 
-.byte	102,73,15,126,208
-.byte	102,73,15,126,217
+	movq	%xmm2,%r8
+	movq	%xmm3,%r9
 	orq	%r8,%r12
 .byte	0x3e
 	jnz	.Ladd_proceedx
@@ -4022,7 +4022,7 @@
 
 
 
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 	pxor	%xmm0,%xmm0
 	movdqu	%xmm0,0(%rdi)
 	movdqu	%xmm0,16(%rdi)
@@ -4034,8 +4034,8 @@
 
 .align	32
 .Ladd_doublex:
-.byte	102,72,15,126,206
-.byte	102,72,15,126,199
+	movq	%xmm1,%rsi
+	movq	%xmm0,%rdi
 	addq	$416,%rsp
 .cfi_adjust_cfa_offset	-416
 	jmp	.Lpoint_double_shortcutx
@@ -4171,7 +4171,7 @@
 	leaq	320(%rsp),%rdi
 	call	__ecp_nistz256_sub_fromx
 
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 
 	movdqa	%xmm5,%xmm0
 	movdqa	%xmm5,%xmm1
@@ -4324,7 +4324,7 @@
 	pshufd	$0x1e,%xmm5,%xmm4
 	movdqa	%xmm1,416+16(%rsp)
 	por	%xmm0,%xmm1
-.byte	102,72,15,110,199
+	movq	%rdi,%xmm0
 	movdqa	%xmm2,448(%rsp)
 	movdqa	%xmm3,448+16(%rsp)
 	por	%xmm2,%xmm3
@@ -4502,7 +4502,7 @@
 	leaq	256(%rsp),%rdi
 	call	__ecp_nistz256_sub_fromx
 
-.byte	102,72,15,126,199
+	movq	%xmm0,%rdi
 
 	movdqa	%xmm5,%xmm0
 	movdqa	%xmm5,%xmm1
diff --git a/gen/bcm/p256-x86_64-asm-win.asm b/gen/bcm/p256-x86_64-asm-win.asm
index 10a1c0d..194df1c 100644
--- a/gen/bcm/p256-x86_64-asm-win.asm
+++ b/gen/bcm/p256-x86_64-asm-win.asm
@@ -486,21 +486,21 @@
 	mov	rbp,rax
 	mul	r8
 	mov	r9,rax
-DB	102,72,15,110,205
+	movq	xmm1,rbp
 	mov	rax,r14
 	mov	r10,rdx
 
 	mul	r8
 	add	r10,rax
 	mov	rax,r15
-DB	102,73,15,110,214
+	movq	xmm2,r14
 	adc	rdx,0
 	mov	r11,rdx
 
 	mul	r8
 	add	r11,rax
 	mov	rax,r15
-DB	102,73,15,110,223
+	movq	xmm3,r15
 	adc	rdx,0
 	mov	r12,rdx
 
@@ -539,20 +539,20 @@
 
 	mul	rax
 	mov	r8,rax
-DB	102,72,15,126,200
+	movq	rax,xmm1
 	mov	rbp,rdx
 
 	mul	rax
 	add	r9,rbp
 	adc	r10,rax
-DB	102,72,15,126,208
+	movq	rax,xmm2
 	adc	rdx,0
 	mov	rbp,rdx
 
 	mul	rax
 	add	r11,rbp
 	adc	r12,rax
-DB	102,72,15,126,216
+	movq	rax,xmm3
 	adc	rdx,0
 	mov	rbp,rdx
 
@@ -1033,11 +1033,11 @@
 	mulx	r10,r9,r14
 	mulx	r11,rcx,r15
 	mov	rax,rdx
-DB	102,73,15,110,206
+	movq	xmm1,r14
 	mulx	r12,rbp,r8
 	mov	rdx,r14
 	add	r10,rcx
-DB	102,73,15,110,215
+	movq	xmm2,r15
 	adc	r11,rbp
 	adc	r12,0
 	xor	r13,r13
@@ -1054,7 +1054,7 @@
 
 	mulx	r14,rcx,r8
 	mov	rdx,rax
-DB	102,73,15,110,216
+	movq	xmm3,r8
 	xor	r15,r15
 	adcx	r9,r9
 	adox	r13,rcx
@@ -1063,18 +1063,18 @@
 
 
 	mulx	rbp,r8,rdx
-DB	102,72,15,126,202
+	movq	rdx,xmm1
 	adcx	r11,r11
 	adox	r9,rbp
 	adcx	r12,r12
 	mulx	rax,rcx,rdx
-DB	102,72,15,126,210
+	movq	rdx,xmm2
 	adcx	r13,r13
 	adox	r10,rcx
 	adcx	r14,r14
 	mulx	rbp,rcx,rdx
 	DB	0x67
-DB	102,72,15,126,218
+	movq	rdx,xmm3
 	adox	r11,rax
 	adcx	r15,r15
 	adox	r12,rcx
@@ -2659,9 +2659,9 @@
 	movdqa	XMMWORD[(96+16)+rsp],xmm1
 	lea	r10,[32+rdi]
 	lea	r11,[64+rdi]
-DB	102,72,15,110,199
-DB	102,73,15,110,202
-DB	102,73,15,110,211
+	movq	xmm0,rdi
+	movq	xmm1,r10
+	movq	xmm2,r11
 
 	lea	rdi,[rsp]
 	call	__ecp_nistz256_mul_by_2q
@@ -2689,7 +2689,7 @@
 	mov	r12,QWORD[((64+24))+rbx]
 	lea	rsi,[((64-0))+rbx]
 	lea	rbx,[32+rbx]
-DB	102,72,15,126,215
+	movq	rdi,xmm2
 	call	__ecp_nistz256_mul_montq
 	call	__ecp_nistz256_mul_by_2q
 
@@ -2714,7 +2714,7 @@
 	lea	rsi,[((0+0))+rsp]
 	mov	r15,QWORD[((16+0))+rsp]
 	mov	r8,QWORD[((24+0))+rsp]
-DB	102,72,15,126,207
+	movq	rdi,xmm1
 	call	__ecp_nistz256_sqr_montq
 	xor	r9,r9
 	mov	rax,r12
@@ -2789,7 +2789,7 @@
 	lea	rsi,[((0+32))+rsp]
 	mov	r15,QWORD[((16+32))+rsp]
 	mov	r8,QWORD[((24+32))+rsp]
-DB	102,72,15,126,199
+	movq	rdi,xmm0
 	call	__ecp_nistz256_sqr_montq
 
 	lea	rbx,[128+rsp]
@@ -2822,8 +2822,8 @@
 	lea	rdi,[rsp]
 	call	__ecp_nistz256_mul_montq
 
-DB	102,72,15,126,203
-DB	102,72,15,126,207
+	movq	rbx,xmm1
+	movq	rdi,xmm1
 	call	__ecp_nistz256_sub_fromq
 
 	lea	rsi,[((160+56))+rsp]
@@ -2915,7 +2915,7 @@
 	por	xmm5,xmm4
 	pxor	xmm4,xmm4
 	por	xmm1,xmm0
-DB	102,72,15,110,199
+	movq	xmm0,rdi
 
 	lea	rsi,[((64-0))+rsi]
 	mov	QWORD[((544+0))+rsp],rax
@@ -2938,7 +2938,7 @@
 	mov	r14,QWORD[((64+8))+rbx]
 	mov	r15,QWORD[((64+16))+rbx]
 	mov	r8,QWORD[((64+24))+rbx]
-DB	102,72,15,110,203
+	movq	xmm1,rbx
 
 	lea	rsi,[((64-0))+rbx]
 	lea	rdi,[32+rsp]
@@ -2993,7 +2993,7 @@
 	or	r12,r8
 	or	r12,r9
 	por	xmm2,xmm5
-DB	102,73,15,110,220
+	movq	xmm3,r12
 
 	mov	rax,QWORD[384+rsp]
 	lea	rbx,[384+rsp]
@@ -3023,8 +3023,8 @@
 	or	r12,r8
 	or	r12,r9
 
-DB	102,73,15,126,208
-DB	102,73,15,126,217
+	movq	r8,xmm2
+	movq	r9,xmm3
 	or	r12,r8
 	DB	0x3e
 	jnz	NEAR $L$add_proceedq
@@ -3039,7 +3039,7 @@
 
 
 
-DB	102,72,15,126,199
+	movq	rdi,xmm0
 	pxor	xmm0,xmm0
 	movdqu	XMMWORD[rdi],xmm0
 	movdqu	XMMWORD[16+rdi],xmm0
@@ -3051,8 +3051,8 @@
 
 ALIGN	32
 $L$add_doubleq:
-DB	102,72,15,126,206
-DB	102,72,15,126,199
+	movq	rsi,xmm1
+	movq	rdi,xmm0
 	add	rsp,416
 
 	jmp	NEAR $L$point_double_shortcutq
@@ -3188,7 +3188,7 @@
 	lea	rdi,[320+rsp]
 	call	__ecp_nistz256_sub_fromq
 
-DB	102,72,15,126,199
+	movq	rdi,xmm0
 
 	movdqa	xmm0,xmm5
 	movdqa	xmm1,xmm5
@@ -3345,7 +3345,7 @@
 	pshufd	xmm4,xmm5,0x1e
 	movdqa	XMMWORD[(416+16)+rsp],xmm1
 	por	xmm1,xmm0
-DB	102,72,15,110,199
+	movq	xmm0,rdi
 	movdqa	XMMWORD[448+rsp],xmm2
 	movdqa	XMMWORD[(448+16)+rsp],xmm3
 	por	xmm3,xmm2
@@ -3523,7 +3523,7 @@
 	lea	rdi,[256+rsp]
 	call	__ecp_nistz256_sub_fromq
 
-DB	102,72,15,126,199
+	movq	rdi,xmm0
 
 	movdqa	xmm0,xmm5
 	movdqa	xmm1,xmm5
@@ -3798,9 +3798,9 @@
 	movdqa	XMMWORD[(96+16)+rsp],xmm1
 	lea	r10,[32+rdi]
 	lea	r11,[64+rdi]
-DB	102,72,15,110,199
-DB	102,73,15,110,202
-DB	102,73,15,110,211
+	movq	xmm0,rdi
+	movq	xmm1,r10
+	movq	xmm2,r11
 
 	lea	rdi,[rsp]
 	call	__ecp_nistz256_mul_by_2x
@@ -3828,7 +3828,7 @@
 	mov	r12,QWORD[((64+24))+rbx]
 	lea	rsi,[((64-128))+rbx]
 	lea	rbx,[32+rbx]
-DB	102,72,15,126,215
+	movq	rdi,xmm2
 	call	__ecp_nistz256_mul_montx
 	call	__ecp_nistz256_mul_by_2x
 
@@ -3853,7 +3853,7 @@
 	lea	rsi,[((-128+0))+rsp]
 	mov	r15,QWORD[((16+0))+rsp]
 	mov	r8,QWORD[((24+0))+rsp]
-DB	102,72,15,126,207
+	movq	rdi,xmm1
 	call	__ecp_nistz256_sqr_montx
 	xor	r9,r9
 	mov	rax,r12
@@ -3928,7 +3928,7 @@
 	lea	rsi,[((-128+32))+rsp]
 	mov	r15,QWORD[((16+32))+rsp]
 	mov	r8,QWORD[((24+32))+rsp]
-DB	102,72,15,126,199
+	movq	rdi,xmm0
 	call	__ecp_nistz256_sqr_montx
 
 	lea	rbx,[128+rsp]
@@ -3961,8 +3961,8 @@
 	lea	rdi,[rsp]
 	call	__ecp_nistz256_mul_montx
 
-DB	102,72,15,126,203
-DB	102,72,15,126,207
+	movq	rbx,xmm1
+	movq	rdi,xmm1
 	call	__ecp_nistz256_sub_fromx
 
 	lea	rsi,[((160+56))+rsp]
@@ -4054,7 +4054,7 @@
 	por	xmm5,xmm4
 	pxor	xmm4,xmm4
 	por	xmm1,xmm0
-DB	102,72,15,110,199
+	movq	xmm0,rdi
 
 	lea	rsi,[((64-128))+rsi]
 	mov	QWORD[((544+0))+rsp],rdx
@@ -4077,7 +4077,7 @@
 	mov	r14,QWORD[((64+8))+rbx]
 	mov	r15,QWORD[((64+16))+rbx]
 	mov	r8,QWORD[((64+24))+rbx]
-DB	102,72,15,110,203
+	movq	xmm1,rbx
 
 	lea	rsi,[((64-128))+rbx]
 	lea	rdi,[32+rsp]
@@ -4132,7 +4132,7 @@
 	or	r12,r8
 	or	r12,r9
 	por	xmm2,xmm5
-DB	102,73,15,110,220
+	movq	xmm3,r12
 
 	mov	rdx,QWORD[384+rsp]
 	lea	rbx,[384+rsp]
@@ -4162,8 +4162,8 @@
 	or	r12,r8
 	or	r12,r9
 
-DB	102,73,15,126,208
-DB	102,73,15,126,217
+	movq	r8,xmm2
+	movq	r9,xmm3
 	or	r12,r8
 	DB	0x3e
 	jnz	NEAR $L$add_proceedx
@@ -4178,7 +4178,7 @@
 
 
 
-DB	102,72,15,126,199
+	movq	rdi,xmm0
 	pxor	xmm0,xmm0
 	movdqu	XMMWORD[rdi],xmm0
 	movdqu	XMMWORD[16+rdi],xmm0
@@ -4190,8 +4190,8 @@
 
 ALIGN	32
 $L$add_doublex:
-DB	102,72,15,126,206
-DB	102,72,15,126,199
+	movq	rsi,xmm1
+	movq	rdi,xmm0
 	add	rsp,416
 
 	jmp	NEAR $L$point_double_shortcutx
@@ -4327,7 +4327,7 @@
 	lea	rdi,[320+rsp]
 	call	__ecp_nistz256_sub_fromx
 
-DB	102,72,15,126,199
+	movq	rdi,xmm0
 
 	movdqa	xmm0,xmm5
 	movdqa	xmm1,xmm5
@@ -4484,7 +4484,7 @@
 	pshufd	xmm4,xmm5,0x1e
 	movdqa	XMMWORD[(416+16)+rsp],xmm1
 	por	xmm1,xmm0
-DB	102,72,15,110,199
+	movq	xmm0,rdi
 	movdqa	XMMWORD[448+rsp],xmm2
 	movdqa	XMMWORD[(448+16)+rsp],xmm3
 	por	xmm3,xmm2
@@ -4662,7 +4662,7 @@
 	lea	rdi,[256+rsp]
 	call	__ecp_nistz256_sub_fromx
 
-DB	102,72,15,126,199
+	movq	rdi,xmm0
 
 	movdqa	xmm0,xmm5
 	movdqa	xmm1,xmm5
diff --git a/gen/bcm/rdrand-x86_64-apple.S b/gen/bcm/rdrand-x86_64-apple.S
index 5fdf105..4f990d9 100644
--- a/gen/bcm/rdrand-x86_64-apple.S
+++ b/gen/bcm/rdrand-x86_64-apple.S
@@ -17,7 +17,7 @@
 
 _CET_ENDBR
 	xorq	%rax,%rax
-.byte	72,15,199,242
+	rdrand	%rdx
 
 	adcq	%rax,%rax
 	movq	%rdx,0(%rdi)
@@ -40,7 +40,7 @@
 	jz	L$out
 	movq	$8,%rdx
 L$loop:
-.byte	72,15,199,241
+	rdrand	%rcx
 	jnc	L$err
 	movq	%rcx,0(%rdi)
 	addq	%rdx,%rdi
diff --git a/gen/bcm/rdrand-x86_64-linux.S b/gen/bcm/rdrand-x86_64-linux.S
index fe81dac..52a1eb2 100644
--- a/gen/bcm/rdrand-x86_64-linux.S
+++ b/gen/bcm/rdrand-x86_64-linux.S
@@ -17,7 +17,7 @@
 .cfi_startproc	
 _CET_ENDBR
 	xorq	%rax,%rax
-.byte	72,15,199,242
+	rdrand	%rdx
 
 	adcq	%rax,%rax
 	movq	%rdx,0(%rdi)
@@ -40,7 +40,7 @@
 	jz	.Lout
 	movq	$8,%rdx
 .Lloop:
-.byte	72,15,199,241
+	rdrand	%rcx
 	jnc	.Lerr
 	movq	%rcx,0(%rdi)
 	addq	%rdx,%rdi
diff --git a/gen/bcm/rdrand-x86_64-win.asm b/gen/bcm/rdrand-x86_64-win.asm
index aae3d76..6dba87b 100644
--- a/gen/bcm/rdrand-x86_64-win.asm
+++ b/gen/bcm/rdrand-x86_64-win.asm
@@ -24,7 +24,7 @@
 
 _CET_ENDBR
 	xor	rax,rax
-DB	73,15,199,240
+	rdrand	r8
 
 	adc	rax,rax
 	mov	QWORD[rcx],r8
@@ -46,7 +46,7 @@
 	jz	NEAR $L$out
 	mov	r8,8
 $L$loop:
-DB	73,15,199,241
+	rdrand	r9
 	jnc	NEAR $L$err
 	mov	QWORD[rcx],r9
 	add	rcx,r8
diff --git a/gen/bcm/sha1-x86_64-apple.S b/gen/bcm/sha1-x86_64-apple.S
index a1ea1e6..32b3bc7 100644
--- a/gen/bcm/sha1-x86_64-apple.S
+++ b/gen/bcm/sha1-x86_64-apple.S
@@ -1259,12 +1259,12 @@
 	movdqu	16(%rsi),%xmm5
 	pshufd	$27,%xmm1,%xmm1
 	movdqu	32(%rsi),%xmm6
-.byte	102,15,56,0,227
+	pshufb	%xmm3,%xmm4
 	movdqu	48(%rsi),%xmm7
-.byte	102,15,56,0,235
-.byte	102,15,56,0,243
+	pshufb	%xmm3,%xmm5
+	pshufb	%xmm3,%xmm6
 	movdqa	%xmm1,%xmm9
-.byte	102,15,56,0,251
+	pshufb	%xmm3,%xmm7
 	jmp	L$oop_shaext
 
 .p2align	4
@@ -1275,133 +1275,133 @@
 	cmovneq	%r8,%rsi
 	prefetcht0	512(%rsi)
 	movdqa	%xmm0,%xmm8
-.byte	15,56,201,229
+	sha1msg1	%xmm5,%xmm4
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,0
-.byte	15,56,200,213
+	sha1rnds4	$0,%xmm1,%xmm0
+	sha1nexte	%xmm5,%xmm2
 	pxor	%xmm6,%xmm4
-.byte	15,56,201,238
-.byte	15,56,202,231
+	sha1msg1	%xmm6,%xmm5
+	sha1msg2	%xmm7,%xmm4
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,0
-.byte	15,56,200,206
+	sha1rnds4	$0,%xmm2,%xmm0
+	sha1nexte	%xmm6,%xmm1
 	pxor	%xmm7,%xmm5
-.byte	15,56,202,236
-.byte	15,56,201,247
+	sha1msg2	%xmm4,%xmm5
+	sha1msg1	%xmm7,%xmm6
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,0
-.byte	15,56,200,215
+	sha1rnds4	$0,%xmm1,%xmm0
+	sha1nexte	%xmm7,%xmm2
 	pxor	%xmm4,%xmm6
-.byte	15,56,201,252
-.byte	15,56,202,245
+	sha1msg1	%xmm4,%xmm7
+	sha1msg2	%xmm5,%xmm6
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,0
-.byte	15,56,200,204
+	sha1rnds4	$0,%xmm2,%xmm0
+	sha1nexte	%xmm4,%xmm1
 	pxor	%xmm5,%xmm7
-.byte	15,56,202,254
-.byte	15,56,201,229
+	sha1msg2	%xmm6,%xmm7
+	sha1msg1	%xmm5,%xmm4
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,0
-.byte	15,56,200,213
+	sha1rnds4	$0,%xmm1,%xmm0
+	sha1nexte	%xmm5,%xmm2
 	pxor	%xmm6,%xmm4
-.byte	15,56,201,238
-.byte	15,56,202,231
+	sha1msg1	%xmm6,%xmm5
+	sha1msg2	%xmm7,%xmm4
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,1
-.byte	15,56,200,206
+	sha1rnds4	$1,%xmm2,%xmm0
+	sha1nexte	%xmm6,%xmm1
 	pxor	%xmm7,%xmm5
-.byte	15,56,202,236
-.byte	15,56,201,247
+	sha1msg2	%xmm4,%xmm5
+	sha1msg1	%xmm7,%xmm6
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,1
-.byte	15,56,200,215
+	sha1rnds4	$1,%xmm1,%xmm0
+	sha1nexte	%xmm7,%xmm2
 	pxor	%xmm4,%xmm6
-.byte	15,56,201,252
-.byte	15,56,202,245
+	sha1msg1	%xmm4,%xmm7
+	sha1msg2	%xmm5,%xmm6
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,1
-.byte	15,56,200,204
+	sha1rnds4	$1,%xmm2,%xmm0
+	sha1nexte	%xmm4,%xmm1
 	pxor	%xmm5,%xmm7
-.byte	15,56,202,254
-.byte	15,56,201,229
+	sha1msg2	%xmm6,%xmm7
+	sha1msg1	%xmm5,%xmm4
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,1
-.byte	15,56,200,213
+	sha1rnds4	$1,%xmm1,%xmm0
+	sha1nexte	%xmm5,%xmm2
 	pxor	%xmm6,%xmm4
-.byte	15,56,201,238
-.byte	15,56,202,231
+	sha1msg1	%xmm6,%xmm5
+	sha1msg2	%xmm7,%xmm4
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,1
-.byte	15,56,200,206
+	sha1rnds4	$1,%xmm2,%xmm0
+	sha1nexte	%xmm6,%xmm1
 	pxor	%xmm7,%xmm5
-.byte	15,56,202,236
-.byte	15,56,201,247
+	sha1msg2	%xmm4,%xmm5
+	sha1msg1	%xmm7,%xmm6
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,2
-.byte	15,56,200,215
+	sha1rnds4	$2,%xmm1,%xmm0
+	sha1nexte	%xmm7,%xmm2
 	pxor	%xmm4,%xmm6
-.byte	15,56,201,252
-.byte	15,56,202,245
+	sha1msg1	%xmm4,%xmm7
+	sha1msg2	%xmm5,%xmm6
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,2
-.byte	15,56,200,204
+	sha1rnds4	$2,%xmm2,%xmm0
+	sha1nexte	%xmm4,%xmm1
 	pxor	%xmm5,%xmm7
-.byte	15,56,202,254
-.byte	15,56,201,229
+	sha1msg2	%xmm6,%xmm7
+	sha1msg1	%xmm5,%xmm4
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,2
-.byte	15,56,200,213
+	sha1rnds4	$2,%xmm1,%xmm0
+	sha1nexte	%xmm5,%xmm2
 	pxor	%xmm6,%xmm4
-.byte	15,56,201,238
-.byte	15,56,202,231
+	sha1msg1	%xmm6,%xmm5
+	sha1msg2	%xmm7,%xmm4
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,2
-.byte	15,56,200,206
+	sha1rnds4	$2,%xmm2,%xmm0
+	sha1nexte	%xmm6,%xmm1
 	pxor	%xmm7,%xmm5
-.byte	15,56,202,236
-.byte	15,56,201,247
+	sha1msg2	%xmm4,%xmm5
+	sha1msg1	%xmm7,%xmm6
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,2
-.byte	15,56,200,215
+	sha1rnds4	$2,%xmm1,%xmm0
+	sha1nexte	%xmm7,%xmm2
 	pxor	%xmm4,%xmm6
-.byte	15,56,201,252
-.byte	15,56,202,245
+	sha1msg1	%xmm4,%xmm7
+	sha1msg2	%xmm5,%xmm6
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,3
-.byte	15,56,200,204
+	sha1rnds4	$3,%xmm2,%xmm0
+	sha1nexte	%xmm4,%xmm1
 	pxor	%xmm5,%xmm7
-.byte	15,56,202,254
+	sha1msg2	%xmm6,%xmm7
 	movdqu	(%rsi),%xmm4
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,3
-.byte	15,56,200,213
+	sha1rnds4	$3,%xmm1,%xmm0
+	sha1nexte	%xmm5,%xmm2
 	movdqu	16(%rsi),%xmm5
-.byte	102,15,56,0,227
+	pshufb	%xmm3,%xmm4
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,3
-.byte	15,56,200,206
+	sha1rnds4	$3,%xmm2,%xmm0
+	sha1nexte	%xmm6,%xmm1
 	movdqu	32(%rsi),%xmm6
-.byte	102,15,56,0,235
+	pshufb	%xmm3,%xmm5
 
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,3
-.byte	15,56,200,215
+	sha1rnds4	$3,%xmm1,%xmm0
+	sha1nexte	%xmm7,%xmm2
 	movdqu	48(%rsi),%xmm7
-.byte	102,15,56,0,243
+	pshufb	%xmm3,%xmm6
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,3
-.byte	65,15,56,200,201
-.byte	102,15,56,0,251
+	sha1rnds4	$3,%xmm2,%xmm0
+	sha1nexte	%xmm9,%xmm1
+	pshufb	%xmm3,%xmm7
 
 	paddd	%xmm8,%xmm0
 	movdqa	%xmm1,%xmm9
@@ -1460,12 +1460,12 @@
 	movdqu	16(%r9),%xmm1
 	movdqu	32(%r9),%xmm2
 	movdqu	48(%r9),%xmm3
-.byte	102,15,56,0,198
-.byte	102,15,56,0,206
-.byte	102,15,56,0,214
+	pshufb	%xmm6,%xmm0
+	pshufb	%xmm6,%xmm1
+	pshufb	%xmm6,%xmm2
 	addq	$64,%r9
 	paddd	%xmm9,%xmm0
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm9,%xmm1
 	paddd	%xmm9,%xmm2
 	movdqa	%xmm0,0(%rsp)
@@ -2357,12 +2357,12 @@
 	movdqu	16(%r9),%xmm1
 	movdqu	32(%r9),%xmm2
 	movdqu	48(%r9),%xmm3
-.byte	102,15,56,0,198
+	pshufb	%xmm6,%xmm0
 	addq	$64,%r9
 	addl	16(%rsp),%ebx
 	xorl	%ebp,%esi
 	movl	%ecx,%edi
-.byte	102,15,56,0,206
+	pshufb	%xmm6,%xmm1
 	roll	$5,%ecx
 	addl	%esi,%ebx
 	xorl	%ebp,%edi
@@ -2398,7 +2398,7 @@
 	addl	32(%rsp),%ecx
 	xorl	%eax,%esi
 	movl	%edx,%edi
-.byte	102,15,56,0,214
+	pshufb	%xmm6,%xmm2
 	roll	$5,%edx
 	addl	%esi,%ecx
 	xorl	%eax,%edi
@@ -2434,7 +2434,7 @@
 	addl	48(%rsp),%edx
 	xorl	%ebx,%esi
 	movl	%ebp,%edi
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	roll	$5,%ebp
 	addl	%esi,%edx
 	xorl	%ebx,%edi
diff --git a/gen/bcm/sha1-x86_64-linux.S b/gen/bcm/sha1-x86_64-linux.S
index 39d9ad3..1f4807a 100644
--- a/gen/bcm/sha1-x86_64-linux.S
+++ b/gen/bcm/sha1-x86_64-linux.S
@@ -1259,12 +1259,12 @@
 	movdqu	16(%rsi),%xmm5
 	pshufd	$27,%xmm1,%xmm1
 	movdqu	32(%rsi),%xmm6
-.byte	102,15,56,0,227
+	pshufb	%xmm3,%xmm4
 	movdqu	48(%rsi),%xmm7
-.byte	102,15,56,0,235
-.byte	102,15,56,0,243
+	pshufb	%xmm3,%xmm5
+	pshufb	%xmm3,%xmm6
 	movdqa	%xmm1,%xmm9
-.byte	102,15,56,0,251
+	pshufb	%xmm3,%xmm7
 	jmp	.Loop_shaext
 
 .align	16
@@ -1275,133 +1275,133 @@
 	cmovneq	%r8,%rsi
 	prefetcht0	512(%rsi)
 	movdqa	%xmm0,%xmm8
-.byte	15,56,201,229
+	sha1msg1	%xmm5,%xmm4
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,0
-.byte	15,56,200,213
+	sha1rnds4	$0,%xmm1,%xmm0
+	sha1nexte	%xmm5,%xmm2
 	pxor	%xmm6,%xmm4
-.byte	15,56,201,238
-.byte	15,56,202,231
+	sha1msg1	%xmm6,%xmm5
+	sha1msg2	%xmm7,%xmm4
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,0
-.byte	15,56,200,206
+	sha1rnds4	$0,%xmm2,%xmm0
+	sha1nexte	%xmm6,%xmm1
 	pxor	%xmm7,%xmm5
-.byte	15,56,202,236
-.byte	15,56,201,247
+	sha1msg2	%xmm4,%xmm5
+	sha1msg1	%xmm7,%xmm6
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,0
-.byte	15,56,200,215
+	sha1rnds4	$0,%xmm1,%xmm0
+	sha1nexte	%xmm7,%xmm2
 	pxor	%xmm4,%xmm6
-.byte	15,56,201,252
-.byte	15,56,202,245
+	sha1msg1	%xmm4,%xmm7
+	sha1msg2	%xmm5,%xmm6
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,0
-.byte	15,56,200,204
+	sha1rnds4	$0,%xmm2,%xmm0
+	sha1nexte	%xmm4,%xmm1
 	pxor	%xmm5,%xmm7
-.byte	15,56,202,254
-.byte	15,56,201,229
+	sha1msg2	%xmm6,%xmm7
+	sha1msg1	%xmm5,%xmm4
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,0
-.byte	15,56,200,213
+	sha1rnds4	$0,%xmm1,%xmm0
+	sha1nexte	%xmm5,%xmm2
 	pxor	%xmm6,%xmm4
-.byte	15,56,201,238
-.byte	15,56,202,231
+	sha1msg1	%xmm6,%xmm5
+	sha1msg2	%xmm7,%xmm4
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,1
-.byte	15,56,200,206
+	sha1rnds4	$1,%xmm2,%xmm0
+	sha1nexte	%xmm6,%xmm1
 	pxor	%xmm7,%xmm5
-.byte	15,56,202,236
-.byte	15,56,201,247
+	sha1msg2	%xmm4,%xmm5
+	sha1msg1	%xmm7,%xmm6
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,1
-.byte	15,56,200,215
+	sha1rnds4	$1,%xmm1,%xmm0
+	sha1nexte	%xmm7,%xmm2
 	pxor	%xmm4,%xmm6
-.byte	15,56,201,252
-.byte	15,56,202,245
+	sha1msg1	%xmm4,%xmm7
+	sha1msg2	%xmm5,%xmm6
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,1
-.byte	15,56,200,204
+	sha1rnds4	$1,%xmm2,%xmm0
+	sha1nexte	%xmm4,%xmm1
 	pxor	%xmm5,%xmm7
-.byte	15,56,202,254
-.byte	15,56,201,229
+	sha1msg2	%xmm6,%xmm7
+	sha1msg1	%xmm5,%xmm4
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,1
-.byte	15,56,200,213
+	sha1rnds4	$1,%xmm1,%xmm0
+	sha1nexte	%xmm5,%xmm2
 	pxor	%xmm6,%xmm4
-.byte	15,56,201,238
-.byte	15,56,202,231
+	sha1msg1	%xmm6,%xmm5
+	sha1msg2	%xmm7,%xmm4
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,1
-.byte	15,56,200,206
+	sha1rnds4	$1,%xmm2,%xmm0
+	sha1nexte	%xmm6,%xmm1
 	pxor	%xmm7,%xmm5
-.byte	15,56,202,236
-.byte	15,56,201,247
+	sha1msg2	%xmm4,%xmm5
+	sha1msg1	%xmm7,%xmm6
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,2
-.byte	15,56,200,215
+	sha1rnds4	$2,%xmm1,%xmm0
+	sha1nexte	%xmm7,%xmm2
 	pxor	%xmm4,%xmm6
-.byte	15,56,201,252
-.byte	15,56,202,245
+	sha1msg1	%xmm4,%xmm7
+	sha1msg2	%xmm5,%xmm6
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,2
-.byte	15,56,200,204
+	sha1rnds4	$2,%xmm2,%xmm0
+	sha1nexte	%xmm4,%xmm1
 	pxor	%xmm5,%xmm7
-.byte	15,56,202,254
-.byte	15,56,201,229
+	sha1msg2	%xmm6,%xmm7
+	sha1msg1	%xmm5,%xmm4
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,2
-.byte	15,56,200,213
+	sha1rnds4	$2,%xmm1,%xmm0
+	sha1nexte	%xmm5,%xmm2
 	pxor	%xmm6,%xmm4
-.byte	15,56,201,238
-.byte	15,56,202,231
+	sha1msg1	%xmm6,%xmm5
+	sha1msg2	%xmm7,%xmm4
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,2
-.byte	15,56,200,206
+	sha1rnds4	$2,%xmm2,%xmm0
+	sha1nexte	%xmm6,%xmm1
 	pxor	%xmm7,%xmm5
-.byte	15,56,202,236
-.byte	15,56,201,247
+	sha1msg2	%xmm4,%xmm5
+	sha1msg1	%xmm7,%xmm6
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,2
-.byte	15,56,200,215
+	sha1rnds4	$2,%xmm1,%xmm0
+	sha1nexte	%xmm7,%xmm2
 	pxor	%xmm4,%xmm6
-.byte	15,56,201,252
-.byte	15,56,202,245
+	sha1msg1	%xmm4,%xmm7
+	sha1msg2	%xmm5,%xmm6
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,3
-.byte	15,56,200,204
+	sha1rnds4	$3,%xmm2,%xmm0
+	sha1nexte	%xmm4,%xmm1
 	pxor	%xmm5,%xmm7
-.byte	15,56,202,254
+	sha1msg2	%xmm6,%xmm7
 	movdqu	(%rsi),%xmm4
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,3
-.byte	15,56,200,213
+	sha1rnds4	$3,%xmm1,%xmm0
+	sha1nexte	%xmm5,%xmm2
 	movdqu	16(%rsi),%xmm5
-.byte	102,15,56,0,227
+	pshufb	%xmm3,%xmm4
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,3
-.byte	15,56,200,206
+	sha1rnds4	$3,%xmm2,%xmm0
+	sha1nexte	%xmm6,%xmm1
 	movdqu	32(%rsi),%xmm6
-.byte	102,15,56,0,235
+	pshufb	%xmm3,%xmm5
 
 	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,3
-.byte	15,56,200,215
+	sha1rnds4	$3,%xmm1,%xmm0
+	sha1nexte	%xmm7,%xmm2
 	movdqu	48(%rsi),%xmm7
-.byte	102,15,56,0,243
+	pshufb	%xmm3,%xmm6
 
 	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,3
-.byte	65,15,56,200,201
-.byte	102,15,56,0,251
+	sha1rnds4	$3,%xmm2,%xmm0
+	sha1nexte	%xmm9,%xmm1
+	pshufb	%xmm3,%xmm7
 
 	paddd	%xmm8,%xmm0
 	movdqa	%xmm1,%xmm9
@@ -1460,12 +1460,12 @@
 	movdqu	16(%r9),%xmm1
 	movdqu	32(%r9),%xmm2
 	movdqu	48(%r9),%xmm3
-.byte	102,15,56,0,198
-.byte	102,15,56,0,206
-.byte	102,15,56,0,214
+	pshufb	%xmm6,%xmm0
+	pshufb	%xmm6,%xmm1
+	pshufb	%xmm6,%xmm2
 	addq	$64,%r9
 	paddd	%xmm9,%xmm0
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm9,%xmm1
 	paddd	%xmm9,%xmm2
 	movdqa	%xmm0,0(%rsp)
@@ -2357,12 +2357,12 @@
 	movdqu	16(%r9),%xmm1
 	movdqu	32(%r9),%xmm2
 	movdqu	48(%r9),%xmm3
-.byte	102,15,56,0,198
+	pshufb	%xmm6,%xmm0
 	addq	$64,%r9
 	addl	16(%rsp),%ebx
 	xorl	%ebp,%esi
 	movl	%ecx,%edi
-.byte	102,15,56,0,206
+	pshufb	%xmm6,%xmm1
 	roll	$5,%ecx
 	addl	%esi,%ebx
 	xorl	%ebp,%edi
@@ -2398,7 +2398,7 @@
 	addl	32(%rsp),%ecx
 	xorl	%eax,%esi
 	movl	%edx,%edi
-.byte	102,15,56,0,214
+	pshufb	%xmm6,%xmm2
 	roll	$5,%edx
 	addl	%esi,%ecx
 	xorl	%eax,%edi
@@ -2434,7 +2434,7 @@
 	addl	48(%rsp),%edx
 	xorl	%ebx,%esi
 	movl	%ebp,%edi
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	roll	$5,%ebp
 	addl	%esi,%edx
 	xorl	%ebx,%edi
diff --git a/gen/bcm/sha1-x86_64-win.asm b/gen/bcm/sha1-x86_64-win.asm
index 92e9b9c..c5da333 100644
--- a/gen/bcm/sha1-x86_64-win.asm
+++ b/gen/bcm/sha1-x86_64-win.asm
@@ -1291,12 +1291,12 @@
 	movdqu	xmm5,XMMWORD[16+rsi]
 	pshufd	xmm1,xmm1,27
 	movdqu	xmm6,XMMWORD[32+rsi]
-DB	102,15,56,0,227
+	pshufb	xmm4,xmm3
 	movdqu	xmm7,XMMWORD[48+rsi]
-DB	102,15,56,0,235
-DB	102,15,56,0,243
+	pshufb	xmm5,xmm3
+	pshufb	xmm6,xmm3
 	movdqa	xmm9,xmm1
-DB	102,15,56,0,251
+	pshufb	xmm7,xmm3
 	jmp	NEAR $L$oop_shaext
 
 ALIGN	16
@@ -1307,133 +1307,133 @@
 	cmovne	rsi,r8
 	prefetcht0	[512+rsi]
 	movdqa	xmm8,xmm0
-	DB	15,56,201,229
+	sha1msg1	xmm4,xmm5
 	movdqa	xmm2,xmm0
-	DB	15,58,204,193,0
-	DB	15,56,200,213
+	sha1rnds4	xmm0,xmm1,0
+	sha1nexte	xmm2,xmm5
 	pxor	xmm4,xmm6
-	DB	15,56,201,238
-	DB	15,56,202,231
+	sha1msg1	xmm5,xmm6
+	sha1msg2	xmm4,xmm7
 
 	movdqa	xmm1,xmm0
-	DB	15,58,204,194,0
-	DB	15,56,200,206
+	sha1rnds4	xmm0,xmm2,0
+	sha1nexte	xmm1,xmm6
 	pxor	xmm5,xmm7
-	DB	15,56,202,236
-	DB	15,56,201,247
+	sha1msg2	xmm5,xmm4
+	sha1msg1	xmm6,xmm7
 	movdqa	xmm2,xmm0
-	DB	15,58,204,193,0
-	DB	15,56,200,215
+	sha1rnds4	xmm0,xmm1,0
+	sha1nexte	xmm2,xmm7
 	pxor	xmm6,xmm4
-	DB	15,56,201,252
-	DB	15,56,202,245
+	sha1msg1	xmm7,xmm4
+	sha1msg2	xmm6,xmm5
 
 	movdqa	xmm1,xmm0
-	DB	15,58,204,194,0
-	DB	15,56,200,204
+	sha1rnds4	xmm0,xmm2,0
+	sha1nexte	xmm1,xmm4
 	pxor	xmm7,xmm5
-	DB	15,56,202,254
-	DB	15,56,201,229
+	sha1msg2	xmm7,xmm6
+	sha1msg1	xmm4,xmm5
 	movdqa	xmm2,xmm0
-	DB	15,58,204,193,0
-	DB	15,56,200,213
+	sha1rnds4	xmm0,xmm1,0
+	sha1nexte	xmm2,xmm5
 	pxor	xmm4,xmm6
-	DB	15,56,201,238
-	DB	15,56,202,231
+	sha1msg1	xmm5,xmm6
+	sha1msg2	xmm4,xmm7
 
 	movdqa	xmm1,xmm0
-	DB	15,58,204,194,1
-	DB	15,56,200,206
+	sha1rnds4	xmm0,xmm2,1
+	sha1nexte	xmm1,xmm6
 	pxor	xmm5,xmm7
-	DB	15,56,202,236
-	DB	15,56,201,247
+	sha1msg2	xmm5,xmm4
+	sha1msg1	xmm6,xmm7
 	movdqa	xmm2,xmm0
-	DB	15,58,204,193,1
-	DB	15,56,200,215
+	sha1rnds4	xmm0,xmm1,1
+	sha1nexte	xmm2,xmm7
 	pxor	xmm6,xmm4
-	DB	15,56,201,252
-	DB	15,56,202,245
+	sha1msg1	xmm7,xmm4
+	sha1msg2	xmm6,xmm5
 
 	movdqa	xmm1,xmm0
-	DB	15,58,204,194,1
-	DB	15,56,200,204
+	sha1rnds4	xmm0,xmm2,1
+	sha1nexte	xmm1,xmm4
 	pxor	xmm7,xmm5
-	DB	15,56,202,254
-	DB	15,56,201,229
+	sha1msg2	xmm7,xmm6
+	sha1msg1	xmm4,xmm5
 	movdqa	xmm2,xmm0
-	DB	15,58,204,193,1
-	DB	15,56,200,213
+	sha1rnds4	xmm0,xmm1,1
+	sha1nexte	xmm2,xmm5
 	pxor	xmm4,xmm6
-	DB	15,56,201,238
-	DB	15,56,202,231
+	sha1msg1	xmm5,xmm6
+	sha1msg2	xmm4,xmm7
 
 	movdqa	xmm1,xmm0
-	DB	15,58,204,194,1
-	DB	15,56,200,206
+	sha1rnds4	xmm0,xmm2,1
+	sha1nexte	xmm1,xmm6
 	pxor	xmm5,xmm7
-	DB	15,56,202,236
-	DB	15,56,201,247
+	sha1msg2	xmm5,xmm4
+	sha1msg1	xmm6,xmm7
 	movdqa	xmm2,xmm0
-	DB	15,58,204,193,2
-	DB	15,56,200,215
+	sha1rnds4	xmm0,xmm1,2
+	sha1nexte	xmm2,xmm7
 	pxor	xmm6,xmm4
-	DB	15,56,201,252
-	DB	15,56,202,245
+	sha1msg1	xmm7,xmm4
+	sha1msg2	xmm6,xmm5
 
 	movdqa	xmm1,xmm0
-	DB	15,58,204,194,2
-	DB	15,56,200,204
+	sha1rnds4	xmm0,xmm2,2
+	sha1nexte	xmm1,xmm4
 	pxor	xmm7,xmm5
-	DB	15,56,202,254
-	DB	15,56,201,229
+	sha1msg2	xmm7,xmm6
+	sha1msg1	xmm4,xmm5
 	movdqa	xmm2,xmm0
-	DB	15,58,204,193,2
-	DB	15,56,200,213
+	sha1rnds4	xmm0,xmm1,2
+	sha1nexte	xmm2,xmm5
 	pxor	xmm4,xmm6
-	DB	15,56,201,238
-	DB	15,56,202,231
+	sha1msg1	xmm5,xmm6
+	sha1msg2	xmm4,xmm7
 
 	movdqa	xmm1,xmm0
-	DB	15,58,204,194,2
-	DB	15,56,200,206
+	sha1rnds4	xmm0,xmm2,2
+	sha1nexte	xmm1,xmm6
 	pxor	xmm5,xmm7
-	DB	15,56,202,236
-	DB	15,56,201,247
+	sha1msg2	xmm5,xmm4
+	sha1msg1	xmm6,xmm7
 	movdqa	xmm2,xmm0
-	DB	15,58,204,193,2
-	DB	15,56,200,215
+	sha1rnds4	xmm0,xmm1,2
+	sha1nexte	xmm2,xmm7
 	pxor	xmm6,xmm4
-	DB	15,56,201,252
-	DB	15,56,202,245
+	sha1msg1	xmm7,xmm4
+	sha1msg2	xmm6,xmm5
 
 	movdqa	xmm1,xmm0
-	DB	15,58,204,194,3
-	DB	15,56,200,204
+	sha1rnds4	xmm0,xmm2,3
+	sha1nexte	xmm1,xmm4
 	pxor	xmm7,xmm5
-	DB	15,56,202,254
+	sha1msg2	xmm7,xmm6
 	movdqu	xmm4,XMMWORD[rsi]
 	movdqa	xmm2,xmm0
-	DB	15,58,204,193,3
-	DB	15,56,200,213
+	sha1rnds4	xmm0,xmm1,3
+	sha1nexte	xmm2,xmm5
 	movdqu	xmm5,XMMWORD[16+rsi]
-DB	102,15,56,0,227
+	pshufb	xmm4,xmm3
 
 	movdqa	xmm1,xmm0
-	DB	15,58,204,194,3
-	DB	15,56,200,206
+	sha1rnds4	xmm0,xmm2,3
+	sha1nexte	xmm1,xmm6
 	movdqu	xmm6,XMMWORD[32+rsi]
-DB	102,15,56,0,235
+	pshufb	xmm5,xmm3
 
 	movdqa	xmm2,xmm0
-	DB	15,58,204,193,3
-	DB	15,56,200,215
+	sha1rnds4	xmm0,xmm1,3
+	sha1nexte	xmm2,xmm7
 	movdqu	xmm7,XMMWORD[48+rsi]
-DB	102,15,56,0,243
+	pshufb	xmm6,xmm3
 
 	movdqa	xmm1,xmm0
-	DB	15,58,204,194,3
-	DB	65,15,56,200,201
-DB	102,15,56,0,251
+	sha1rnds4	xmm0,xmm2,3
+	sha1nexte	xmm1,xmm9
+	pshufb	xmm7,xmm3
 
 	paddd	xmm0,xmm8
 	movdqa	xmm9,xmm1
@@ -1515,12 +1515,12 @@
 	movdqu	xmm1,XMMWORD[16+r9]
 	movdqu	xmm2,XMMWORD[32+r9]
 	movdqu	xmm3,XMMWORD[48+r9]
-DB	102,15,56,0,198
-DB	102,15,56,0,206
-DB	102,15,56,0,214
+	pshufb	xmm0,xmm6
+	pshufb	xmm1,xmm6
+	pshufb	xmm2,xmm6
 	add	r9,64
 	paddd	xmm0,xmm9
-DB	102,15,56,0,222
+	pshufb	xmm3,xmm6
 	paddd	xmm1,xmm9
 	paddd	xmm2,xmm9
 	movdqa	XMMWORD[rsp],xmm0
@@ -2412,12 +2412,12 @@
 	movdqu	xmm1,XMMWORD[16+r9]
 	movdqu	xmm2,XMMWORD[32+r9]
 	movdqu	xmm3,XMMWORD[48+r9]
-DB	102,15,56,0,198
+	pshufb	xmm0,xmm6
 	add	r9,64
 	add	ebx,DWORD[16+rsp]
 	xor	esi,ebp
 	mov	edi,ecx
-DB	102,15,56,0,206
+	pshufb	xmm1,xmm6
 	rol	ecx,5
 	add	ebx,esi
 	xor	edi,ebp
@@ -2453,7 +2453,7 @@
 	add	ecx,DWORD[32+rsp]
 	xor	esi,eax
 	mov	edi,edx
-DB	102,15,56,0,214
+	pshufb	xmm2,xmm6
 	rol	edx,5
 	add	ecx,esi
 	xor	edi,eax
@@ -2489,7 +2489,7 @@
 	add	edx,DWORD[48+rsp]
 	xor	esi,ebx
 	mov	edi,ebp
-DB	102,15,56,0,222
+	pshufb	xmm3,xmm6
 	rol	ebp,5
 	add	edx,esi
 	xor	edi,ebx
diff --git a/gen/bcm/sha256-x86_64-apple.S b/gen/bcm/sha256-x86_64-apple.S
index b33f807..367f0d3 100644
--- a/gen/bcm/sha256-x86_64-apple.S
+++ b/gen/bcm/sha256-x86_64-apple.S
@@ -1780,7 +1780,7 @@
 	pshufd	$0xb1,%xmm1,%xmm1
 	pshufd	$0x1b,%xmm2,%xmm2
 	movdqa	%xmm7,%xmm8
-.byte	102,15,58,15,202,8
+	palignr	$8,%xmm2,%xmm1
 	punpcklqdq	%xmm0,%xmm2
 	jmp	L$oop_shaext
 
@@ -1789,176 +1789,176 @@
 	movdqu	(%rsi),%xmm3
 	movdqu	16(%rsi),%xmm4
 	movdqu	32(%rsi),%xmm5
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	movdqu	48(%rsi),%xmm6
 
 	movdqa	0-128(%rcx),%xmm0
 	paddd	%xmm3,%xmm0
-.byte	102,15,56,0,231
+	pshufb	%xmm7,%xmm4
 	movdqa	%xmm2,%xmm10
-.byte	15,56,203,209
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	nop
 	movdqa	%xmm1,%xmm9
-.byte	15,56,203,202
+	sha256rnds2	%xmm2,%xmm1
 
 	movdqa	32-128(%rcx),%xmm0
 	paddd	%xmm4,%xmm0
-.byte	102,15,56,0,239
-.byte	15,56,203,209
+	pshufb	%xmm7,%xmm5
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	leaq	64(%rsi),%rsi
-.byte	15,56,204,220
-.byte	15,56,203,202
+	sha256msg1	%xmm4,%xmm3
+	sha256rnds2	%xmm2,%xmm1
 
 	movdqa	64-128(%rcx),%xmm0
 	paddd	%xmm5,%xmm0
-.byte	102,15,56,0,247
-.byte	15,56,203,209
+	pshufb	%xmm7,%xmm6
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm6,%xmm7
-.byte	102,15,58,15,253,4
+	palignr	$4,%xmm5,%xmm7
 	nop
 	paddd	%xmm7,%xmm3
-.byte	15,56,204,229
-.byte	15,56,203,202
+	sha256msg1	%xmm5,%xmm4
+	sha256rnds2	%xmm2,%xmm1
 
 	movdqa	96-128(%rcx),%xmm0
 	paddd	%xmm6,%xmm0
-.byte	15,56,205,222
-.byte	15,56,203,209
+	sha256msg2	%xmm6,%xmm3
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm3,%xmm7
-.byte	102,15,58,15,254,4
+	palignr	$4,%xmm6,%xmm7
 	nop
 	paddd	%xmm7,%xmm4
-.byte	15,56,204,238
-.byte	15,56,203,202
+	sha256msg1	%xmm6,%xmm5
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	128-128(%rcx),%xmm0
 	paddd	%xmm3,%xmm0
-.byte	15,56,205,227
-.byte	15,56,203,209
+	sha256msg2	%xmm3,%xmm4
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm4,%xmm7
-.byte	102,15,58,15,251,4
+	palignr	$4,%xmm3,%xmm7
 	nop
 	paddd	%xmm7,%xmm5
-.byte	15,56,204,243
-.byte	15,56,203,202
+	sha256msg1	%xmm3,%xmm6
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	160-128(%rcx),%xmm0
 	paddd	%xmm4,%xmm0
-.byte	15,56,205,236
-.byte	15,56,203,209
+	sha256msg2	%xmm4,%xmm5
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm5,%xmm7
-.byte	102,15,58,15,252,4
+	palignr	$4,%xmm4,%xmm7
 	nop
 	paddd	%xmm7,%xmm6
-.byte	15,56,204,220
-.byte	15,56,203,202
+	sha256msg1	%xmm4,%xmm3
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	192-128(%rcx),%xmm0
 	paddd	%xmm5,%xmm0
-.byte	15,56,205,245
-.byte	15,56,203,209
+	sha256msg2	%xmm5,%xmm6
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm6,%xmm7
-.byte	102,15,58,15,253,4
+	palignr	$4,%xmm5,%xmm7
 	nop
 	paddd	%xmm7,%xmm3
-.byte	15,56,204,229
-.byte	15,56,203,202
+	sha256msg1	%xmm5,%xmm4
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	224-128(%rcx),%xmm0
 	paddd	%xmm6,%xmm0
-.byte	15,56,205,222
-.byte	15,56,203,209
+	sha256msg2	%xmm6,%xmm3
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm3,%xmm7
-.byte	102,15,58,15,254,4
+	palignr	$4,%xmm6,%xmm7
 	nop
 	paddd	%xmm7,%xmm4
-.byte	15,56,204,238
-.byte	15,56,203,202
+	sha256msg1	%xmm6,%xmm5
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	256-128(%rcx),%xmm0
 	paddd	%xmm3,%xmm0
-.byte	15,56,205,227
-.byte	15,56,203,209
+	sha256msg2	%xmm3,%xmm4
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm4,%xmm7
-.byte	102,15,58,15,251,4
+	palignr	$4,%xmm3,%xmm7
 	nop
 	paddd	%xmm7,%xmm5
-.byte	15,56,204,243
-.byte	15,56,203,202
+	sha256msg1	%xmm3,%xmm6
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	288-128(%rcx),%xmm0
 	paddd	%xmm4,%xmm0
-.byte	15,56,205,236
-.byte	15,56,203,209
+	sha256msg2	%xmm4,%xmm5
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm5,%xmm7
-.byte	102,15,58,15,252,4
+	palignr	$4,%xmm4,%xmm7
 	nop
 	paddd	%xmm7,%xmm6
-.byte	15,56,204,220
-.byte	15,56,203,202
+	sha256msg1	%xmm4,%xmm3
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	320-128(%rcx),%xmm0
 	paddd	%xmm5,%xmm0
-.byte	15,56,205,245
-.byte	15,56,203,209
+	sha256msg2	%xmm5,%xmm6
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm6,%xmm7
-.byte	102,15,58,15,253,4
+	palignr	$4,%xmm5,%xmm7
 	nop
 	paddd	%xmm7,%xmm3
-.byte	15,56,204,229
-.byte	15,56,203,202
+	sha256msg1	%xmm5,%xmm4
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	352-128(%rcx),%xmm0
 	paddd	%xmm6,%xmm0
-.byte	15,56,205,222
-.byte	15,56,203,209
+	sha256msg2	%xmm6,%xmm3
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm3,%xmm7
-.byte	102,15,58,15,254,4
+	palignr	$4,%xmm6,%xmm7
 	nop
 	paddd	%xmm7,%xmm4
-.byte	15,56,204,238
-.byte	15,56,203,202
+	sha256msg1	%xmm6,%xmm5
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	384-128(%rcx),%xmm0
 	paddd	%xmm3,%xmm0
-.byte	15,56,205,227
-.byte	15,56,203,209
+	sha256msg2	%xmm3,%xmm4
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm4,%xmm7
-.byte	102,15,58,15,251,4
+	palignr	$4,%xmm3,%xmm7
 	nop
 	paddd	%xmm7,%xmm5
-.byte	15,56,204,243
-.byte	15,56,203,202
+	sha256msg1	%xmm3,%xmm6
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	416-128(%rcx),%xmm0
 	paddd	%xmm4,%xmm0
-.byte	15,56,205,236
-.byte	15,56,203,209
+	sha256msg2	%xmm4,%xmm5
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm5,%xmm7
-.byte	102,15,58,15,252,4
-.byte	15,56,203,202
+	palignr	$4,%xmm4,%xmm7
+	sha256rnds2	%xmm2,%xmm1
 	paddd	%xmm7,%xmm6
 
 	movdqa	448-128(%rcx),%xmm0
 	paddd	%xmm5,%xmm0
-.byte	15,56,203,209
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
-.byte	15,56,205,245
+	sha256msg2	%xmm5,%xmm6
 	movdqa	%xmm8,%xmm7
-.byte	15,56,203,202
+	sha256rnds2	%xmm2,%xmm1
 
 	movdqa	480-128(%rcx),%xmm0
 	paddd	%xmm6,%xmm0
 	nop
-.byte	15,56,203,209
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	decq	%rdx
 	nop
-.byte	15,56,203,202
+	sha256rnds2	%xmm2,%xmm1
 
 	paddd	%xmm10,%xmm2
 	paddd	%xmm9,%xmm1
@@ -1968,7 +1968,7 @@
 	pshufd	$0x1b,%xmm1,%xmm7
 	pshufd	$0xb1,%xmm1,%xmm1
 	punpckhqdq	%xmm2,%xmm1
-.byte	102,15,58,15,215,8
+	palignr	$8,%xmm7,%xmm2
 
 	movdqu	%xmm1,(%rdi)
 	movdqu	%xmm2,16(%rdi)
@@ -2024,16 +2024,16 @@
 	movdqu	0(%rsi),%xmm0
 	movdqu	16(%rsi),%xmm1
 	movdqu	32(%rsi),%xmm2
-.byte	102,15,56,0,199
+	pshufb	%xmm7,%xmm0
 	movdqu	48(%rsi),%xmm3
 	leaq	K256(%rip),%rbp
-.byte	102,15,56,0,207
+	pshufb	%xmm7,%xmm1
 	movdqa	0(%rbp),%xmm4
 	movdqa	32(%rbp),%xmm5
-.byte	102,15,56,0,215
+	pshufb	%xmm7,%xmm2
 	paddd	%xmm0,%xmm4
 	movdqa	64(%rbp),%xmm6
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	movdqa	96(%rbp),%xmm7
 	paddd	%xmm1,%xmm5
 	paddd	%xmm2,%xmm6
@@ -2061,10 +2061,10 @@
 	xorl	%r10d,%r12d
 	rorl	$5,%r13d
 	xorl	%eax,%r14d
-.byte	102,15,58,15,224,4
+	palignr	$4,%xmm0,%xmm4
 	andl	%r8d,%r12d
 	xorl	%r8d,%r13d
-.byte	102,15,58,15,250,4
+	palignr	$4,%xmm2,%xmm7
 	addl	0(%rsp),%r11d
 	movl	%eax,%r15d
 	xorl	%r10d,%r12d
@@ -2205,10 +2205,10 @@
 	xorl	%ecx,%r12d
 	rorl	$5,%r13d
 	xorl	%r8d,%r14d
-.byte	102,15,58,15,225,4
+	palignr	$4,%xmm1,%xmm4
 	andl	%eax,%r12d
 	xorl	%eax,%r13d
-.byte	102,15,58,15,251,4
+	palignr	$4,%xmm3,%xmm7
 	addl	16(%rsp),%edx
 	movl	%r8d,%r15d
 	xorl	%ecx,%r12d
@@ -2349,10 +2349,10 @@
 	xorl	%r10d,%r12d
 	rorl	$5,%r13d
 	xorl	%eax,%r14d
-.byte	102,15,58,15,226,4
+	palignr	$4,%xmm2,%xmm4
 	andl	%r8d,%r12d
 	xorl	%r8d,%r13d
-.byte	102,15,58,15,248,4
+	palignr	$4,%xmm0,%xmm7
 	addl	32(%rsp),%r11d
 	movl	%eax,%r15d
 	xorl	%r10d,%r12d
@@ -2493,10 +2493,10 @@
 	xorl	%ecx,%r12d
 	rorl	$5,%r13d
 	xorl	%r8d,%r14d
-.byte	102,15,58,15,227,4
+	palignr	$4,%xmm3,%xmm4
 	andl	%eax,%r12d
 	xorl	%eax,%r13d
-.byte	102,15,58,15,249,4
+	palignr	$4,%xmm1,%xmm7
 	addl	48(%rsp),%edx
 	movl	%r8d,%r15d
 	xorl	%ecx,%r12d
diff --git a/gen/bcm/sha256-x86_64-linux.S b/gen/bcm/sha256-x86_64-linux.S
index 8476b03..938f531 100644
--- a/gen/bcm/sha256-x86_64-linux.S
+++ b/gen/bcm/sha256-x86_64-linux.S
@@ -1780,7 +1780,7 @@
 	pshufd	$0xb1,%xmm1,%xmm1
 	pshufd	$0x1b,%xmm2,%xmm2
 	movdqa	%xmm7,%xmm8
-.byte	102,15,58,15,202,8
+	palignr	$8,%xmm2,%xmm1
 	punpcklqdq	%xmm0,%xmm2
 	jmp	.Loop_shaext
 
@@ -1789,176 +1789,176 @@
 	movdqu	(%rsi),%xmm3
 	movdqu	16(%rsi),%xmm4
 	movdqu	32(%rsi),%xmm5
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	movdqu	48(%rsi),%xmm6
 
 	movdqa	0-128(%rcx),%xmm0
 	paddd	%xmm3,%xmm0
-.byte	102,15,56,0,231
+	pshufb	%xmm7,%xmm4
 	movdqa	%xmm2,%xmm10
-.byte	15,56,203,209
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	nop
 	movdqa	%xmm1,%xmm9
-.byte	15,56,203,202
+	sha256rnds2	%xmm2,%xmm1
 
 	movdqa	32-128(%rcx),%xmm0
 	paddd	%xmm4,%xmm0
-.byte	102,15,56,0,239
-.byte	15,56,203,209
+	pshufb	%xmm7,%xmm5
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	leaq	64(%rsi),%rsi
-.byte	15,56,204,220
-.byte	15,56,203,202
+	sha256msg1	%xmm4,%xmm3
+	sha256rnds2	%xmm2,%xmm1
 
 	movdqa	64-128(%rcx),%xmm0
 	paddd	%xmm5,%xmm0
-.byte	102,15,56,0,247
-.byte	15,56,203,209
+	pshufb	%xmm7,%xmm6
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm6,%xmm7
-.byte	102,15,58,15,253,4
+	palignr	$4,%xmm5,%xmm7
 	nop
 	paddd	%xmm7,%xmm3
-.byte	15,56,204,229
-.byte	15,56,203,202
+	sha256msg1	%xmm5,%xmm4
+	sha256rnds2	%xmm2,%xmm1
 
 	movdqa	96-128(%rcx),%xmm0
 	paddd	%xmm6,%xmm0
-.byte	15,56,205,222
-.byte	15,56,203,209
+	sha256msg2	%xmm6,%xmm3
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm3,%xmm7
-.byte	102,15,58,15,254,4
+	palignr	$4,%xmm6,%xmm7
 	nop
 	paddd	%xmm7,%xmm4
-.byte	15,56,204,238
-.byte	15,56,203,202
+	sha256msg1	%xmm6,%xmm5
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	128-128(%rcx),%xmm0
 	paddd	%xmm3,%xmm0
-.byte	15,56,205,227
-.byte	15,56,203,209
+	sha256msg2	%xmm3,%xmm4
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm4,%xmm7
-.byte	102,15,58,15,251,4
+	palignr	$4,%xmm3,%xmm7
 	nop
 	paddd	%xmm7,%xmm5
-.byte	15,56,204,243
-.byte	15,56,203,202
+	sha256msg1	%xmm3,%xmm6
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	160-128(%rcx),%xmm0
 	paddd	%xmm4,%xmm0
-.byte	15,56,205,236
-.byte	15,56,203,209
+	sha256msg2	%xmm4,%xmm5
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm5,%xmm7
-.byte	102,15,58,15,252,4
+	palignr	$4,%xmm4,%xmm7
 	nop
 	paddd	%xmm7,%xmm6
-.byte	15,56,204,220
-.byte	15,56,203,202
+	sha256msg1	%xmm4,%xmm3
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	192-128(%rcx),%xmm0
 	paddd	%xmm5,%xmm0
-.byte	15,56,205,245
-.byte	15,56,203,209
+	sha256msg2	%xmm5,%xmm6
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm6,%xmm7
-.byte	102,15,58,15,253,4
+	palignr	$4,%xmm5,%xmm7
 	nop
 	paddd	%xmm7,%xmm3
-.byte	15,56,204,229
-.byte	15,56,203,202
+	sha256msg1	%xmm5,%xmm4
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	224-128(%rcx),%xmm0
 	paddd	%xmm6,%xmm0
-.byte	15,56,205,222
-.byte	15,56,203,209
+	sha256msg2	%xmm6,%xmm3
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm3,%xmm7
-.byte	102,15,58,15,254,4
+	palignr	$4,%xmm6,%xmm7
 	nop
 	paddd	%xmm7,%xmm4
-.byte	15,56,204,238
-.byte	15,56,203,202
+	sha256msg1	%xmm6,%xmm5
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	256-128(%rcx),%xmm0
 	paddd	%xmm3,%xmm0
-.byte	15,56,205,227
-.byte	15,56,203,209
+	sha256msg2	%xmm3,%xmm4
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm4,%xmm7
-.byte	102,15,58,15,251,4
+	palignr	$4,%xmm3,%xmm7
 	nop
 	paddd	%xmm7,%xmm5
-.byte	15,56,204,243
-.byte	15,56,203,202
+	sha256msg1	%xmm3,%xmm6
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	288-128(%rcx),%xmm0
 	paddd	%xmm4,%xmm0
-.byte	15,56,205,236
-.byte	15,56,203,209
+	sha256msg2	%xmm4,%xmm5
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm5,%xmm7
-.byte	102,15,58,15,252,4
+	palignr	$4,%xmm4,%xmm7
 	nop
 	paddd	%xmm7,%xmm6
-.byte	15,56,204,220
-.byte	15,56,203,202
+	sha256msg1	%xmm4,%xmm3
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	320-128(%rcx),%xmm0
 	paddd	%xmm5,%xmm0
-.byte	15,56,205,245
-.byte	15,56,203,209
+	sha256msg2	%xmm5,%xmm6
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm6,%xmm7
-.byte	102,15,58,15,253,4
+	palignr	$4,%xmm5,%xmm7
 	nop
 	paddd	%xmm7,%xmm3
-.byte	15,56,204,229
-.byte	15,56,203,202
+	sha256msg1	%xmm5,%xmm4
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	352-128(%rcx),%xmm0
 	paddd	%xmm6,%xmm0
-.byte	15,56,205,222
-.byte	15,56,203,209
+	sha256msg2	%xmm6,%xmm3
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm3,%xmm7
-.byte	102,15,58,15,254,4
+	palignr	$4,%xmm6,%xmm7
 	nop
 	paddd	%xmm7,%xmm4
-.byte	15,56,204,238
-.byte	15,56,203,202
+	sha256msg1	%xmm6,%xmm5
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	384-128(%rcx),%xmm0
 	paddd	%xmm3,%xmm0
-.byte	15,56,205,227
-.byte	15,56,203,209
+	sha256msg2	%xmm3,%xmm4
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm4,%xmm7
-.byte	102,15,58,15,251,4
+	palignr	$4,%xmm3,%xmm7
 	nop
 	paddd	%xmm7,%xmm5
-.byte	15,56,204,243
-.byte	15,56,203,202
+	sha256msg1	%xmm3,%xmm6
+	sha256rnds2	%xmm2,%xmm1
 	movdqa	416-128(%rcx),%xmm0
 	paddd	%xmm4,%xmm0
-.byte	15,56,205,236
-.byte	15,56,203,209
+	sha256msg2	%xmm4,%xmm5
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	movdqa	%xmm5,%xmm7
-.byte	102,15,58,15,252,4
-.byte	15,56,203,202
+	palignr	$4,%xmm4,%xmm7
+	sha256rnds2	%xmm2,%xmm1
 	paddd	%xmm7,%xmm6
 
 	movdqa	448-128(%rcx),%xmm0
 	paddd	%xmm5,%xmm0
-.byte	15,56,203,209
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
-.byte	15,56,205,245
+	sha256msg2	%xmm5,%xmm6
 	movdqa	%xmm8,%xmm7
-.byte	15,56,203,202
+	sha256rnds2	%xmm2,%xmm1
 
 	movdqa	480-128(%rcx),%xmm0
 	paddd	%xmm6,%xmm0
 	nop
-.byte	15,56,203,209
+	sha256rnds2	%xmm1,%xmm2
 	pshufd	$0x0e,%xmm0,%xmm0
 	decq	%rdx
 	nop
-.byte	15,56,203,202
+	sha256rnds2	%xmm2,%xmm1
 
 	paddd	%xmm10,%xmm2
 	paddd	%xmm9,%xmm1
@@ -1968,7 +1968,7 @@
 	pshufd	$0x1b,%xmm1,%xmm7
 	pshufd	$0xb1,%xmm1,%xmm1
 	punpckhqdq	%xmm2,%xmm1
-.byte	102,15,58,15,215,8
+	palignr	$8,%xmm7,%xmm2
 
 	movdqu	%xmm1,(%rdi)
 	movdqu	%xmm2,16(%rdi)
@@ -2024,16 +2024,16 @@
 	movdqu	0(%rsi),%xmm0
 	movdqu	16(%rsi),%xmm1
 	movdqu	32(%rsi),%xmm2
-.byte	102,15,56,0,199
+	pshufb	%xmm7,%xmm0
 	movdqu	48(%rsi),%xmm3
 	leaq	K256(%rip),%rbp
-.byte	102,15,56,0,207
+	pshufb	%xmm7,%xmm1
 	movdqa	0(%rbp),%xmm4
 	movdqa	32(%rbp),%xmm5
-.byte	102,15,56,0,215
+	pshufb	%xmm7,%xmm2
 	paddd	%xmm0,%xmm4
 	movdqa	64(%rbp),%xmm6
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	movdqa	96(%rbp),%xmm7
 	paddd	%xmm1,%xmm5
 	paddd	%xmm2,%xmm6
@@ -2061,10 +2061,10 @@
 	xorl	%r10d,%r12d
 	rorl	$5,%r13d
 	xorl	%eax,%r14d
-.byte	102,15,58,15,224,4
+	palignr	$4,%xmm0,%xmm4
 	andl	%r8d,%r12d
 	xorl	%r8d,%r13d
-.byte	102,15,58,15,250,4
+	palignr	$4,%xmm2,%xmm7
 	addl	0(%rsp),%r11d
 	movl	%eax,%r15d
 	xorl	%r10d,%r12d
@@ -2205,10 +2205,10 @@
 	xorl	%ecx,%r12d
 	rorl	$5,%r13d
 	xorl	%r8d,%r14d
-.byte	102,15,58,15,225,4
+	palignr	$4,%xmm1,%xmm4
 	andl	%eax,%r12d
 	xorl	%eax,%r13d
-.byte	102,15,58,15,251,4
+	palignr	$4,%xmm3,%xmm7
 	addl	16(%rsp),%edx
 	movl	%r8d,%r15d
 	xorl	%ecx,%r12d
@@ -2349,10 +2349,10 @@
 	xorl	%r10d,%r12d
 	rorl	$5,%r13d
 	xorl	%eax,%r14d
-.byte	102,15,58,15,226,4
+	palignr	$4,%xmm2,%xmm4
 	andl	%r8d,%r12d
 	xorl	%r8d,%r13d
-.byte	102,15,58,15,248,4
+	palignr	$4,%xmm0,%xmm7
 	addl	32(%rsp),%r11d
 	movl	%eax,%r15d
 	xorl	%r10d,%r12d
@@ -2493,10 +2493,10 @@
 	xorl	%ecx,%r12d
 	rorl	$5,%r13d
 	xorl	%r8d,%r14d
-.byte	102,15,58,15,227,4
+	palignr	$4,%xmm3,%xmm4
 	andl	%eax,%r12d
 	xorl	%eax,%r13d
-.byte	102,15,58,15,249,4
+	palignr	$4,%xmm1,%xmm7
 	addl	48(%rsp),%edx
 	movl	%r8d,%r15d
 	xorl	%ecx,%r12d
diff --git a/gen/bcm/sha256-x86_64-win.asm b/gen/bcm/sha256-x86_64-win.asm
index ada8dba..b720603 100644
--- a/gen/bcm/sha256-x86_64-win.asm
+++ b/gen/bcm/sha256-x86_64-win.asm
@@ -1818,7 +1818,7 @@
 	pshufd	xmm1,xmm1,0xb1
 	pshufd	xmm2,xmm2,0x1b
 	movdqa	xmm8,xmm7
-DB	102,15,58,15,202,8
+	palignr	xmm1,xmm2,8
 	punpcklqdq	xmm2,xmm0
 	jmp	NEAR $L$oop_shaext
 
@@ -1827,176 +1827,176 @@
 	movdqu	xmm3,XMMWORD[rsi]
 	movdqu	xmm4,XMMWORD[16+rsi]
 	movdqu	xmm5,XMMWORD[32+rsi]
-DB	102,15,56,0,223
+	pshufb	xmm3,xmm7
 	movdqu	xmm6,XMMWORD[48+rsi]
 
 	movdqa	xmm0,XMMWORD[((0-128))+rcx]
 	paddd	xmm0,xmm3
-DB	102,15,56,0,231
+	pshufb	xmm4,xmm7
 	movdqa	xmm10,xmm2
-	DB	15,56,203,209
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	nop
 	movdqa	xmm9,xmm1
-	DB	15,56,203,202
+	sha256rnds2	xmm1,xmm2
 
 	movdqa	xmm0,XMMWORD[((32-128))+rcx]
 	paddd	xmm0,xmm4
-DB	102,15,56,0,239
-	DB	15,56,203,209
+	pshufb	xmm5,xmm7
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	lea	rsi,[64+rsi]
-	DB	15,56,204,220
-	DB	15,56,203,202
+	sha256msg1	xmm3,xmm4
+	sha256rnds2	xmm1,xmm2
 
 	movdqa	xmm0,XMMWORD[((64-128))+rcx]
 	paddd	xmm0,xmm5
-DB	102,15,56,0,247
-	DB	15,56,203,209
+	pshufb	xmm6,xmm7
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	movdqa	xmm7,xmm6
-DB	102,15,58,15,253,4
+	palignr	xmm7,xmm5,4
 	nop
 	paddd	xmm3,xmm7
-	DB	15,56,204,229
-	DB	15,56,203,202
+	sha256msg1	xmm4,xmm5
+	sha256rnds2	xmm1,xmm2
 
 	movdqa	xmm0,XMMWORD[((96-128))+rcx]
 	paddd	xmm0,xmm6
-	DB	15,56,205,222
-	DB	15,56,203,209
+	sha256msg2	xmm3,xmm6
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	movdqa	xmm7,xmm3
-DB	102,15,58,15,254,4
+	palignr	xmm7,xmm6,4
 	nop
 	paddd	xmm4,xmm7
-	DB	15,56,204,238
-	DB	15,56,203,202
+	sha256msg1	xmm5,xmm6
+	sha256rnds2	xmm1,xmm2
 	movdqa	xmm0,XMMWORD[((128-128))+rcx]
 	paddd	xmm0,xmm3
-	DB	15,56,205,227
-	DB	15,56,203,209
+	sha256msg2	xmm4,xmm3
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	movdqa	xmm7,xmm4
-DB	102,15,58,15,251,4
+	palignr	xmm7,xmm3,4
 	nop
 	paddd	xmm5,xmm7
-	DB	15,56,204,243
-	DB	15,56,203,202
+	sha256msg1	xmm6,xmm3
+	sha256rnds2	xmm1,xmm2
 	movdqa	xmm0,XMMWORD[((160-128))+rcx]
 	paddd	xmm0,xmm4
-	DB	15,56,205,236
-	DB	15,56,203,209
+	sha256msg2	xmm5,xmm4
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	movdqa	xmm7,xmm5
-DB	102,15,58,15,252,4
+	palignr	xmm7,xmm4,4
 	nop
 	paddd	xmm6,xmm7
-	DB	15,56,204,220
-	DB	15,56,203,202
+	sha256msg1	xmm3,xmm4
+	sha256rnds2	xmm1,xmm2
 	movdqa	xmm0,XMMWORD[((192-128))+rcx]
 	paddd	xmm0,xmm5
-	DB	15,56,205,245
-	DB	15,56,203,209
+	sha256msg2	xmm6,xmm5
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	movdqa	xmm7,xmm6
-DB	102,15,58,15,253,4
+	palignr	xmm7,xmm5,4
 	nop
 	paddd	xmm3,xmm7
-	DB	15,56,204,229
-	DB	15,56,203,202
+	sha256msg1	xmm4,xmm5
+	sha256rnds2	xmm1,xmm2
 	movdqa	xmm0,XMMWORD[((224-128))+rcx]
 	paddd	xmm0,xmm6
-	DB	15,56,205,222
-	DB	15,56,203,209
+	sha256msg2	xmm3,xmm6
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	movdqa	xmm7,xmm3
-DB	102,15,58,15,254,4
+	palignr	xmm7,xmm6,4
 	nop
 	paddd	xmm4,xmm7
-	DB	15,56,204,238
-	DB	15,56,203,202
+	sha256msg1	xmm5,xmm6
+	sha256rnds2	xmm1,xmm2
 	movdqa	xmm0,XMMWORD[((256-128))+rcx]
 	paddd	xmm0,xmm3
-	DB	15,56,205,227
-	DB	15,56,203,209
+	sha256msg2	xmm4,xmm3
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	movdqa	xmm7,xmm4
-DB	102,15,58,15,251,4
+	palignr	xmm7,xmm3,4
 	nop
 	paddd	xmm5,xmm7
-	DB	15,56,204,243
-	DB	15,56,203,202
+	sha256msg1	xmm6,xmm3
+	sha256rnds2	xmm1,xmm2
 	movdqa	xmm0,XMMWORD[((288-128))+rcx]
 	paddd	xmm0,xmm4
-	DB	15,56,205,236
-	DB	15,56,203,209
+	sha256msg2	xmm5,xmm4
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	movdqa	xmm7,xmm5
-DB	102,15,58,15,252,4
+	palignr	xmm7,xmm4,4
 	nop
 	paddd	xmm6,xmm7
-	DB	15,56,204,220
-	DB	15,56,203,202
+	sha256msg1	xmm3,xmm4
+	sha256rnds2	xmm1,xmm2
 	movdqa	xmm0,XMMWORD[((320-128))+rcx]
 	paddd	xmm0,xmm5
-	DB	15,56,205,245
-	DB	15,56,203,209
+	sha256msg2	xmm6,xmm5
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	movdqa	xmm7,xmm6
-DB	102,15,58,15,253,4
+	palignr	xmm7,xmm5,4
 	nop
 	paddd	xmm3,xmm7
-	DB	15,56,204,229
-	DB	15,56,203,202
+	sha256msg1	xmm4,xmm5
+	sha256rnds2	xmm1,xmm2
 	movdqa	xmm0,XMMWORD[((352-128))+rcx]
 	paddd	xmm0,xmm6
-	DB	15,56,205,222
-	DB	15,56,203,209
+	sha256msg2	xmm3,xmm6
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	movdqa	xmm7,xmm3
-DB	102,15,58,15,254,4
+	palignr	xmm7,xmm6,4
 	nop
 	paddd	xmm4,xmm7
-	DB	15,56,204,238
-	DB	15,56,203,202
+	sha256msg1	xmm5,xmm6
+	sha256rnds2	xmm1,xmm2
 	movdqa	xmm0,XMMWORD[((384-128))+rcx]
 	paddd	xmm0,xmm3
-	DB	15,56,205,227
-	DB	15,56,203,209
+	sha256msg2	xmm4,xmm3
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	movdqa	xmm7,xmm4
-DB	102,15,58,15,251,4
+	palignr	xmm7,xmm3,4
 	nop
 	paddd	xmm5,xmm7
-	DB	15,56,204,243
-	DB	15,56,203,202
+	sha256msg1	xmm6,xmm3
+	sha256rnds2	xmm1,xmm2
 	movdqa	xmm0,XMMWORD[((416-128))+rcx]
 	paddd	xmm0,xmm4
-	DB	15,56,205,236
-	DB	15,56,203,209
+	sha256msg2	xmm5,xmm4
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	movdqa	xmm7,xmm5
-DB	102,15,58,15,252,4
-	DB	15,56,203,202
+	palignr	xmm7,xmm4,4
+	sha256rnds2	xmm1,xmm2
 	paddd	xmm6,xmm7
 
 	movdqa	xmm0,XMMWORD[((448-128))+rcx]
 	paddd	xmm0,xmm5
-	DB	15,56,203,209
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
-	DB	15,56,205,245
+	sha256msg2	xmm6,xmm5
 	movdqa	xmm7,xmm8
-	DB	15,56,203,202
+	sha256rnds2	xmm1,xmm2
 
 	movdqa	xmm0,XMMWORD[((480-128))+rcx]
 	paddd	xmm0,xmm6
 	nop
-	DB	15,56,203,209
+	sha256rnds2	xmm2,xmm1
 	pshufd	xmm0,xmm0,0x0e
 	dec	rdx
 	nop
-	DB	15,56,203,202
+	sha256rnds2	xmm1,xmm2
 
 	paddd	xmm2,xmm10
 	paddd	xmm1,xmm9
@@ -2006,7 +2006,7 @@
 	pshufd	xmm7,xmm1,0x1b
 	pshufd	xmm1,xmm1,0xb1
 	punpckhqdq	xmm1,xmm2
-DB	102,15,58,15,215,8
+	palignr	xmm2,xmm7,8
 
 	movdqu	XMMWORD[rdi],xmm1
 	movdqu	XMMWORD[16+rdi],xmm2
@@ -2083,16 +2083,16 @@
 	movdqu	xmm0,XMMWORD[rsi]
 	movdqu	xmm1,XMMWORD[16+rsi]
 	movdqu	xmm2,XMMWORD[32+rsi]
-DB	102,15,56,0,199
+	pshufb	xmm0,xmm7
 	movdqu	xmm3,XMMWORD[48+rsi]
 	lea	rbp,[K256]
-DB	102,15,56,0,207
+	pshufb	xmm1,xmm7
 	movdqa	xmm4,XMMWORD[rbp]
 	movdqa	xmm5,XMMWORD[32+rbp]
-DB	102,15,56,0,215
+	pshufb	xmm2,xmm7
 	paddd	xmm4,xmm0
 	movdqa	xmm6,XMMWORD[64+rbp]
-DB	102,15,56,0,223
+	pshufb	xmm3,xmm7
 	movdqa	xmm7,XMMWORD[96+rbp]
 	paddd	xmm5,xmm1
 	paddd	xmm6,xmm2
@@ -2120,10 +2120,10 @@
 	xor	r12d,r10d
 	ror	r13d,5
 	xor	r14d,eax
-DB	102,15,58,15,224,4
+	palignr	xmm4,xmm0,4
 	and	r12d,r8d
 	xor	r13d,r8d
-DB	102,15,58,15,250,4
+	palignr	xmm7,xmm2,4
 	add	r11d,DWORD[rsp]
 	mov	r15d,eax
 	xor	r12d,r10d
@@ -2264,10 +2264,10 @@
 	xor	r12d,ecx
 	ror	r13d,5
 	xor	r14d,r8d
-DB	102,15,58,15,225,4
+	palignr	xmm4,xmm1,4
 	and	r12d,eax
 	xor	r13d,eax
-DB	102,15,58,15,251,4
+	palignr	xmm7,xmm3,4
 	add	edx,DWORD[16+rsp]
 	mov	r15d,r8d
 	xor	r12d,ecx
@@ -2408,10 +2408,10 @@
 	xor	r12d,r10d
 	ror	r13d,5
 	xor	r14d,eax
-DB	102,15,58,15,226,4
+	palignr	xmm4,xmm2,4
 	and	r12d,r8d
 	xor	r13d,r8d
-DB	102,15,58,15,248,4
+	palignr	xmm7,xmm0,4
 	add	r11d,DWORD[32+rsp]
 	mov	r15d,eax
 	xor	r12d,r10d
@@ -2552,10 +2552,10 @@
 	xor	r12d,ecx
 	ror	r13d,5
 	xor	r14d,r8d
-DB	102,15,58,15,227,4
+	palignr	xmm4,xmm3,4
 	and	r12d,eax
 	xor	r13d,eax
-DB	102,15,58,15,249,4
+	palignr	xmm7,xmm1,4
 	add	edx,DWORD[48+rsp]
 	mov	r15d,r8d
 	xor	r12d,ecx
diff --git a/gen/bcm/vpaes-x86_64-apple.S b/gen/bcm/vpaes-x86_64-apple.S
index 5aea40f..bfcc030 100644
--- a/gen/bcm/vpaes-x86_64-apple.S
+++ b/gen/bcm/vpaes-x86_64-apple.S
@@ -34,9 +34,9 @@
 	movdqu	(%r9),%xmm5
 	psrld	$4,%xmm1
 	pand	%xmm9,%xmm0
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	L$k_ipt+16(%rip),%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	pxor	%xmm5,%xmm2
 	addq	$16,%r9
 	pxor	%xmm2,%xmm0
@@ -48,25 +48,25 @@
 
 	movdqa	%xmm13,%xmm4
 	movdqa	%xmm12,%xmm0
-.byte	102,15,56,0,226
-.byte	102,15,56,0,195
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm3,%xmm0
 	pxor	%xmm5,%xmm4
 	movdqa	%xmm15,%xmm5
 	pxor	%xmm4,%xmm0
 	movdqa	-64(%r11,%r10,1),%xmm1
-.byte	102,15,56,0,234
+	pshufb	%xmm2,%xmm5
 	movdqa	(%r11,%r10,1),%xmm4
 	movdqa	%xmm14,%xmm2
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm0,%xmm3
 	pxor	%xmm5,%xmm2
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	addq	$16,%r9
 	pxor	%xmm2,%xmm0
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	addq	$16,%r11
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	andq	$0x30,%r11
 	subq	$1,%rax
 	pxor	%xmm3,%xmm0
@@ -78,19 +78,19 @@
 	pandn	%xmm0,%xmm1
 	psrld	$4,%xmm1
 	pand	%xmm9,%xmm0
-.byte	102,15,56,0,232
+	pshufb	%xmm0,%xmm5
 	movdqa	%xmm10,%xmm3
 	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	movdqa	%xmm10,%xmm4
 	pxor	%xmm5,%xmm3
-.byte	102,15,56,0,224
+	pshufb	%xmm0,%xmm4
 	movdqa	%xmm10,%xmm2
 	pxor	%xmm5,%xmm4
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm10,%xmm3
 	pxor	%xmm0,%xmm2
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	movdqu	(%r9),%xmm5
 	pxor	%xmm1,%xmm3
 	jnz	L$enc_loop
@@ -98,12 +98,12 @@
 
 	movdqa	-96(%r10),%xmm4
 	movdqa	-80(%r10),%xmm0
-.byte	102,15,56,0,226
+	pshufb	%xmm2,%xmm4
 	pxor	%xmm5,%xmm4
-.byte	102,15,56,0,195
+	pshufb	%xmm3,%xmm0
 	movdqa	64(%r11,%r10,1),%xmm1
 	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	ret
 
 
@@ -156,12 +156,12 @@
 	psrld	$4,%xmm7
 	pand	%xmm9,%xmm0
 	pand	%xmm9,%xmm6
-.byte	102,15,56,0,208
-.byte	102,68,15,56,0,198
+	pshufb	%xmm0,%xmm2
+	pshufb	%xmm6,%xmm8
 	movdqa	L$k_ipt+16(%rip),%xmm0
 	movdqa	%xmm0,%xmm6
-.byte	102,15,56,0,193
-.byte	102,15,56,0,247
+	pshufb	%xmm1,%xmm0
+	pshufb	%xmm7,%xmm6
 	pxor	%xmm5,%xmm2
 	pxor	%xmm5,%xmm8
 	addq	$16,%r9
@@ -177,10 +177,10 @@
 	movdqa	L$k_sb1+16(%rip),%xmm0
 	movdqa	%xmm4,%xmm12
 	movdqa	%xmm0,%xmm6
-.byte	102,15,56,0,226
-.byte	102,69,15,56,0,224
-.byte	102,15,56,0,195
-.byte	102,65,15,56,0,243
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm8,%xmm12
+	pshufb	%xmm3,%xmm0
+	pshufb	%xmm11,%xmm6
 	pxor	%xmm5,%xmm4
 	pxor	%xmm5,%xmm12
 	movdqa	L$k_sb2(%rip),%xmm5
@@ -189,30 +189,30 @@
 	pxor	%xmm12,%xmm6
 	movdqa	-64(%r11,%r10,1),%xmm1
 
-.byte	102,15,56,0,234
-.byte	102,69,15,56,0,232
+	pshufb	%xmm2,%xmm5
+	pshufb	%xmm8,%xmm13
 	movdqa	(%r11,%r10,1),%xmm4
 
 	movdqa	L$k_sb2+16(%rip),%xmm2
 	movdqa	%xmm2,%xmm8
-.byte	102,15,56,0,211
-.byte	102,69,15,56,0,195
+	pshufb	%xmm3,%xmm2
+	pshufb	%xmm11,%xmm8
 	movdqa	%xmm0,%xmm3
 	movdqa	%xmm6,%xmm11
 	pxor	%xmm5,%xmm2
 	pxor	%xmm13,%xmm8
-.byte	102,15,56,0,193
-.byte	102,15,56,0,241
+	pshufb	%xmm1,%xmm0
+	pshufb	%xmm1,%xmm6
 	addq	$16,%r9
 	pxor	%xmm2,%xmm0
 	pxor	%xmm8,%xmm6
-.byte	102,15,56,0,220
-.byte	102,68,15,56,0,220
+	pshufb	%xmm4,%xmm3
+	pshufb	%xmm4,%xmm11
 	addq	$16,%r11
 	pxor	%xmm0,%xmm3
 	pxor	%xmm6,%xmm11
-.byte	102,15,56,0,193
-.byte	102,15,56,0,241
+	pshufb	%xmm1,%xmm0
+	pshufb	%xmm1,%xmm6
 	andq	$0x30,%r11
 	subq	$1,%rax
 	pxor	%xmm3,%xmm0
@@ -230,32 +230,32 @@
 	psrld	$4,%xmm7
 	pand	%xmm9,%xmm0
 	pand	%xmm9,%xmm6
-.byte	102,15,56,0,232
-.byte	102,68,15,56,0,238
+	pshufb	%xmm0,%xmm5
+	pshufb	%xmm6,%xmm13
 	movdqa	%xmm10,%xmm3
 	movdqa	%xmm10,%xmm11
 	pxor	%xmm1,%xmm0
 	pxor	%xmm7,%xmm6
-.byte	102,15,56,0,217
-.byte	102,68,15,56,0,223
+	pshufb	%xmm1,%xmm3
+	pshufb	%xmm7,%xmm11
 	movdqa	%xmm10,%xmm4
 	movdqa	%xmm10,%xmm12
 	pxor	%xmm5,%xmm3
 	pxor	%xmm13,%xmm11
-.byte	102,15,56,0,224
-.byte	102,68,15,56,0,230
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm6,%xmm12
 	movdqa	%xmm10,%xmm2
 	movdqa	%xmm10,%xmm8
 	pxor	%xmm5,%xmm4
 	pxor	%xmm13,%xmm12
-.byte	102,15,56,0,211
-.byte	102,69,15,56,0,195
+	pshufb	%xmm3,%xmm2
+	pshufb	%xmm11,%xmm8
 	movdqa	%xmm10,%xmm3
 	movdqa	%xmm10,%xmm11
 	pxor	%xmm0,%xmm2
 	pxor	%xmm6,%xmm8
-.byte	102,15,56,0,220
-.byte	102,69,15,56,0,220
+	pshufb	%xmm4,%xmm3
+	pshufb	%xmm12,%xmm11
 	movdqu	(%r9),%xmm5
 
 	pxor	%xmm1,%xmm3
@@ -267,18 +267,18 @@
 	movdqa	-80(%r10),%xmm0
 	movdqa	%xmm4,%xmm12
 	movdqa	%xmm0,%xmm6
-.byte	102,15,56,0,226
-.byte	102,69,15,56,0,224
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm8,%xmm12
 	pxor	%xmm5,%xmm4
 	pxor	%xmm5,%xmm12
-.byte	102,15,56,0,195
-.byte	102,65,15,56,0,243
+	pshufb	%xmm3,%xmm0
+	pshufb	%xmm11,%xmm6
 	movdqa	64(%r11,%r10,1),%xmm1
 
 	pxor	%xmm4,%xmm0
 	pxor	%xmm12,%xmm6
-.byte	102,15,56,0,193
-.byte	102,15,56,0,241
+	pshufb	%xmm1,%xmm0
+	pshufb	%xmm1,%xmm6
 	ret
 
 
@@ -302,11 +302,11 @@
 	movdqu	(%r9),%xmm5
 	shlq	$4,%r11
 	pand	%xmm9,%xmm0
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	L$k_dipt+16(%rip),%xmm0
 	xorq	$0x30,%r11
 	leaq	L$k_dsbd(%rip),%r10
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	andq	$0x30,%r11
 	pxor	%xmm5,%xmm2
 	movdqa	L$k_mc_forward+48(%rip),%xmm5
@@ -322,35 +322,35 @@
 
 	movdqa	-32(%r10),%xmm4
 	movdqa	-16(%r10),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	movdqa	0(%r10),%xmm4
 	pxor	%xmm1,%xmm0
 	movdqa	16(%r10),%xmm1
 
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm5,%xmm0
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	movdqa	32(%r10),%xmm4
 	pxor	%xmm1,%xmm0
 	movdqa	48(%r10),%xmm1
 
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm5,%xmm0
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	movdqa	64(%r10),%xmm4
 	pxor	%xmm1,%xmm0
 	movdqa	80(%r10),%xmm1
 
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm5,%xmm0
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	addq	$16,%r9
-.byte	102,15,58,15,237,12
+	palignr	$12,%xmm5,%xmm5
 	pxor	%xmm1,%xmm0
 	subq	$1,%rax
 
@@ -361,32 +361,32 @@
 	movdqa	%xmm11,%xmm2
 	psrld	$4,%xmm1
 	pand	%xmm9,%xmm0
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	%xmm10,%xmm3
 	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	movdqa	%xmm10,%xmm4
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,224
+	pshufb	%xmm0,%xmm4
 	pxor	%xmm2,%xmm4
 	movdqa	%xmm10,%xmm2
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm10,%xmm3
 	pxor	%xmm0,%xmm2
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	movdqu	(%r9),%xmm0
 	pxor	%xmm1,%xmm3
 	jnz	L$dec_loop
 
 
 	movdqa	96(%r10),%xmm4
-.byte	102,15,56,0,226
+	pshufb	%xmm2,%xmm4
 	pxor	%xmm0,%xmm4
 	movdqa	112(%r10),%xmm0
 	movdqa	-352(%r11),%xmm2
-.byte	102,15,56,0,195
+	pshufb	%xmm3,%xmm0
 	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,194
+	pshufb	%xmm2,%xmm0
 	ret
 
 
@@ -426,7 +426,7 @@
 L$schedule_am_decrypting:
 
 	movdqa	(%r8,%r10,1),%xmm1
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	movdqu	%xmm3,(%rdx)
 	xorq	$0x30,%r8
 
@@ -480,7 +480,7 @@
 
 L$oop_schedule_192:
 	call	_vpaes_schedule_round
-.byte	102,15,58,15,198,8
+	palignr	$8,%xmm6,%xmm0
 	call	_vpaes_schedule_mangle
 	call	_vpaes_schedule_192_smear
 	call	_vpaes_schedule_mangle
@@ -546,7 +546,7 @@
 
 
 	movdqa	(%r8,%r10,1),%xmm1
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	leaq	L$k_opt(%rip),%r11
 	addq	$32,%rdx
 
@@ -622,13 +622,13 @@
 
 
 	pxor	%xmm1,%xmm1
-.byte	102,65,15,58,15,200,15
-.byte	102,69,15,58,15,192,15
+	palignr	$15,%xmm8,%xmm1
+	palignr	$15,%xmm8,%xmm8
 	pxor	%xmm1,%xmm7
 
 
 	pshufd	$0xFF,%xmm0,%xmm0
-.byte	102,15,58,15,192,1
+	palignr	$1,%xmm0,%xmm0
 
 
 
@@ -649,24 +649,24 @@
 	psrld	$4,%xmm1
 	pand	%xmm9,%xmm0
 	movdqa	%xmm11,%xmm2
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	pxor	%xmm1,%xmm0
 	movdqa	%xmm10,%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
 	movdqa	%xmm10,%xmm4
-.byte	102,15,56,0,224
+	pshufb	%xmm0,%xmm4
 	pxor	%xmm2,%xmm4
 	movdqa	%xmm10,%xmm2
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	pxor	%xmm0,%xmm2
 	movdqa	%xmm10,%xmm3
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	pxor	%xmm1,%xmm3
 	movdqa	%xmm13,%xmm4
-.byte	102,15,56,0,226
+	pshufb	%xmm2,%xmm4
 	movdqa	%xmm12,%xmm0
-.byte	102,15,56,0,195
+	pshufb	%xmm3,%xmm0
 	pxor	%xmm4,%xmm0
 
 
@@ -694,9 +694,9 @@
 	psrld	$4,%xmm1
 	pand	%xmm9,%xmm0
 	movdqa	(%r11),%xmm2
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	16(%r11),%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	pxor	%xmm2,%xmm0
 	ret
 
@@ -737,11 +737,11 @@
 
 	addq	$16,%rdx
 	pxor	L$k_s63(%rip),%xmm4
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	movdqa	%xmm4,%xmm3
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	pxor	%xmm4,%xmm3
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	pxor	%xmm4,%xmm3
 
 	jmp	L$schedule_mangle_both
@@ -755,40 +755,40 @@
 	pand	%xmm9,%xmm4
 
 	movdqa	0(%r11),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	movdqa	16(%r11),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 
 	movdqa	32(%r11),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	pxor	%xmm3,%xmm2
 	movdqa	48(%r11),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 
 	movdqa	64(%r11),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	pxor	%xmm3,%xmm2
 	movdqa	80(%r11),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 
 	movdqa	96(%r11),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	pxor	%xmm3,%xmm2
 	movdqa	112(%r11),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
 
 	addq	$-16,%rdx
 
 L$schedule_mangle_both:
 	movdqa	(%r8,%r10,1),%xmm1
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	addq	$-16,%r8
 	andq	$0x30,%r8
 	movdqu	%xmm3,(%rdx)
@@ -969,8 +969,8 @@
 	movdqa	L$rev_ctr(%rip),%xmm1
 	movdqa	%xmm14,%xmm0
 	movdqa	%xmm15,%xmm6
-.byte	102,15,56,0,193
-.byte	102,15,56,0,241
+	pshufb	%xmm1,%xmm0
+	pshufb	%xmm1,%xmm6
 	call	_vpaes_encrypt_core_2x
 	movdqu	(%rdi),%xmm1
 	movdqu	16(%rdi),%xmm2
diff --git a/gen/bcm/vpaes-x86_64-linux.S b/gen/bcm/vpaes-x86_64-linux.S
index 019c638..e788464 100644
--- a/gen/bcm/vpaes-x86_64-linux.S
+++ b/gen/bcm/vpaes-x86_64-linux.S
@@ -34,9 +34,9 @@
 	movdqu	(%r9),%xmm5
 	psrld	$4,%xmm1
 	pand	%xmm9,%xmm0
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	.Lk_ipt+16(%rip),%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	pxor	%xmm5,%xmm2
 	addq	$16,%r9
 	pxor	%xmm2,%xmm0
@@ -48,25 +48,25 @@
 
 	movdqa	%xmm13,%xmm4
 	movdqa	%xmm12,%xmm0
-.byte	102,15,56,0,226
-.byte	102,15,56,0,195
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm3,%xmm0
 	pxor	%xmm5,%xmm4
 	movdqa	%xmm15,%xmm5
 	pxor	%xmm4,%xmm0
 	movdqa	-64(%r11,%r10,1),%xmm1
-.byte	102,15,56,0,234
+	pshufb	%xmm2,%xmm5
 	movdqa	(%r11,%r10,1),%xmm4
 	movdqa	%xmm14,%xmm2
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm0,%xmm3
 	pxor	%xmm5,%xmm2
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	addq	$16,%r9
 	pxor	%xmm2,%xmm0
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	addq	$16,%r11
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	andq	$0x30,%r11
 	subq	$1,%rax
 	pxor	%xmm3,%xmm0
@@ -78,19 +78,19 @@
 	pandn	%xmm0,%xmm1
 	psrld	$4,%xmm1
 	pand	%xmm9,%xmm0
-.byte	102,15,56,0,232
+	pshufb	%xmm0,%xmm5
 	movdqa	%xmm10,%xmm3
 	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	movdqa	%xmm10,%xmm4
 	pxor	%xmm5,%xmm3
-.byte	102,15,56,0,224
+	pshufb	%xmm0,%xmm4
 	movdqa	%xmm10,%xmm2
 	pxor	%xmm5,%xmm4
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm10,%xmm3
 	pxor	%xmm0,%xmm2
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	movdqu	(%r9),%xmm5
 	pxor	%xmm1,%xmm3
 	jnz	.Lenc_loop
@@ -98,12 +98,12 @@
 
 	movdqa	-96(%r10),%xmm4
 	movdqa	-80(%r10),%xmm0
-.byte	102,15,56,0,226
+	pshufb	%xmm2,%xmm4
 	pxor	%xmm5,%xmm4
-.byte	102,15,56,0,195
+	pshufb	%xmm3,%xmm0
 	movdqa	64(%r11,%r10,1),%xmm1
 	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	ret
 .cfi_endproc	
 .size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
@@ -156,12 +156,12 @@
 	psrld	$4,%xmm7
 	pand	%xmm9,%xmm0
 	pand	%xmm9,%xmm6
-.byte	102,15,56,0,208
-.byte	102,68,15,56,0,198
+	pshufb	%xmm0,%xmm2
+	pshufb	%xmm6,%xmm8
 	movdqa	.Lk_ipt+16(%rip),%xmm0
 	movdqa	%xmm0,%xmm6
-.byte	102,15,56,0,193
-.byte	102,15,56,0,247
+	pshufb	%xmm1,%xmm0
+	pshufb	%xmm7,%xmm6
 	pxor	%xmm5,%xmm2
 	pxor	%xmm5,%xmm8
 	addq	$16,%r9
@@ -177,10 +177,10 @@
 	movdqa	.Lk_sb1+16(%rip),%xmm0
 	movdqa	%xmm4,%xmm12
 	movdqa	%xmm0,%xmm6
-.byte	102,15,56,0,226
-.byte	102,69,15,56,0,224
-.byte	102,15,56,0,195
-.byte	102,65,15,56,0,243
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm8,%xmm12
+	pshufb	%xmm3,%xmm0
+	pshufb	%xmm11,%xmm6
 	pxor	%xmm5,%xmm4
 	pxor	%xmm5,%xmm12
 	movdqa	.Lk_sb2(%rip),%xmm5
@@ -189,30 +189,30 @@
 	pxor	%xmm12,%xmm6
 	movdqa	-64(%r11,%r10,1),%xmm1
 
-.byte	102,15,56,0,234
-.byte	102,69,15,56,0,232
+	pshufb	%xmm2,%xmm5
+	pshufb	%xmm8,%xmm13
 	movdqa	(%r11,%r10,1),%xmm4
 
 	movdqa	.Lk_sb2+16(%rip),%xmm2
 	movdqa	%xmm2,%xmm8
-.byte	102,15,56,0,211
-.byte	102,69,15,56,0,195
+	pshufb	%xmm3,%xmm2
+	pshufb	%xmm11,%xmm8
 	movdqa	%xmm0,%xmm3
 	movdqa	%xmm6,%xmm11
 	pxor	%xmm5,%xmm2
 	pxor	%xmm13,%xmm8
-.byte	102,15,56,0,193
-.byte	102,15,56,0,241
+	pshufb	%xmm1,%xmm0
+	pshufb	%xmm1,%xmm6
 	addq	$16,%r9
 	pxor	%xmm2,%xmm0
 	pxor	%xmm8,%xmm6
-.byte	102,15,56,0,220
-.byte	102,68,15,56,0,220
+	pshufb	%xmm4,%xmm3
+	pshufb	%xmm4,%xmm11
 	addq	$16,%r11
 	pxor	%xmm0,%xmm3
 	pxor	%xmm6,%xmm11
-.byte	102,15,56,0,193
-.byte	102,15,56,0,241
+	pshufb	%xmm1,%xmm0
+	pshufb	%xmm1,%xmm6
 	andq	$0x30,%r11
 	subq	$1,%rax
 	pxor	%xmm3,%xmm0
@@ -230,32 +230,32 @@
 	psrld	$4,%xmm7
 	pand	%xmm9,%xmm0
 	pand	%xmm9,%xmm6
-.byte	102,15,56,0,232
-.byte	102,68,15,56,0,238
+	pshufb	%xmm0,%xmm5
+	pshufb	%xmm6,%xmm13
 	movdqa	%xmm10,%xmm3
 	movdqa	%xmm10,%xmm11
 	pxor	%xmm1,%xmm0
 	pxor	%xmm7,%xmm6
-.byte	102,15,56,0,217
-.byte	102,68,15,56,0,223
+	pshufb	%xmm1,%xmm3
+	pshufb	%xmm7,%xmm11
 	movdqa	%xmm10,%xmm4
 	movdqa	%xmm10,%xmm12
 	pxor	%xmm5,%xmm3
 	pxor	%xmm13,%xmm11
-.byte	102,15,56,0,224
-.byte	102,68,15,56,0,230
+	pshufb	%xmm0,%xmm4
+	pshufb	%xmm6,%xmm12
 	movdqa	%xmm10,%xmm2
 	movdqa	%xmm10,%xmm8
 	pxor	%xmm5,%xmm4
 	pxor	%xmm13,%xmm12
-.byte	102,15,56,0,211
-.byte	102,69,15,56,0,195
+	pshufb	%xmm3,%xmm2
+	pshufb	%xmm11,%xmm8
 	movdqa	%xmm10,%xmm3
 	movdqa	%xmm10,%xmm11
 	pxor	%xmm0,%xmm2
 	pxor	%xmm6,%xmm8
-.byte	102,15,56,0,220
-.byte	102,69,15,56,0,220
+	pshufb	%xmm4,%xmm3
+	pshufb	%xmm12,%xmm11
 	movdqu	(%r9),%xmm5
 
 	pxor	%xmm1,%xmm3
@@ -267,18 +267,18 @@
 	movdqa	-80(%r10),%xmm0
 	movdqa	%xmm4,%xmm12
 	movdqa	%xmm0,%xmm6
-.byte	102,15,56,0,226
-.byte	102,69,15,56,0,224
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm8,%xmm12
 	pxor	%xmm5,%xmm4
 	pxor	%xmm5,%xmm12
-.byte	102,15,56,0,195
-.byte	102,65,15,56,0,243
+	pshufb	%xmm3,%xmm0
+	pshufb	%xmm11,%xmm6
 	movdqa	64(%r11,%r10,1),%xmm1
 
 	pxor	%xmm4,%xmm0
 	pxor	%xmm12,%xmm6
-.byte	102,15,56,0,193
-.byte	102,15,56,0,241
+	pshufb	%xmm1,%xmm0
+	pshufb	%xmm1,%xmm6
 	ret
 .cfi_endproc	
 .size	_vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x
@@ -302,11 +302,11 @@
 	movdqu	(%r9),%xmm5
 	shlq	$4,%r11
 	pand	%xmm9,%xmm0
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	.Lk_dipt+16(%rip),%xmm0
 	xorq	$0x30,%r11
 	leaq	.Lk_dsbd(%rip),%r10
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	andq	$0x30,%r11
 	pxor	%xmm5,%xmm2
 	movdqa	.Lk_mc_forward+48(%rip),%xmm5
@@ -322,35 +322,35 @@
 
 	movdqa	-32(%r10),%xmm4
 	movdqa	-16(%r10),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	movdqa	0(%r10),%xmm4
 	pxor	%xmm1,%xmm0
 	movdqa	16(%r10),%xmm1
 
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm5,%xmm0
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	movdqa	32(%r10),%xmm4
 	pxor	%xmm1,%xmm0
 	movdqa	48(%r10),%xmm1
 
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm5,%xmm0
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	movdqa	64(%r10),%xmm4
 	pxor	%xmm1,%xmm0
 	movdqa	80(%r10),%xmm1
 
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
+	pshufb	%xmm2,%xmm4
+	pshufb	%xmm5,%xmm0
+	pshufb	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 	addq	$16,%r9
-.byte	102,15,58,15,237,12
+	palignr	$12,%xmm5,%xmm5
 	pxor	%xmm1,%xmm0
 	subq	$1,%rax
 
@@ -361,32 +361,32 @@
 	movdqa	%xmm11,%xmm2
 	psrld	$4,%xmm1
 	pand	%xmm9,%xmm0
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	%xmm10,%xmm3
 	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	movdqa	%xmm10,%xmm4
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,224
+	pshufb	%xmm0,%xmm4
 	pxor	%xmm2,%xmm4
 	movdqa	%xmm10,%xmm2
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	movdqa	%xmm10,%xmm3
 	pxor	%xmm0,%xmm2
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	movdqu	(%r9),%xmm0
 	pxor	%xmm1,%xmm3
 	jnz	.Ldec_loop
 
 
 	movdqa	96(%r10),%xmm4
-.byte	102,15,56,0,226
+	pshufb	%xmm2,%xmm4
 	pxor	%xmm0,%xmm4
 	movdqa	112(%r10),%xmm0
 	movdqa	-352(%r11),%xmm2
-.byte	102,15,56,0,195
+	pshufb	%xmm3,%xmm0
 	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,194
+	pshufb	%xmm2,%xmm0
 	ret
 .cfi_endproc	
 .size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
@@ -426,7 +426,7 @@
 .Lschedule_am_decrypting:
 
 	movdqa	(%r8,%r10,1),%xmm1
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	movdqu	%xmm3,(%rdx)
 	xorq	$0x30,%r8
 
@@ -480,7 +480,7 @@
 
 .Loop_schedule_192:
 	call	_vpaes_schedule_round
-.byte	102,15,58,15,198,8
+	palignr	$8,%xmm6,%xmm0
 	call	_vpaes_schedule_mangle
 	call	_vpaes_schedule_192_smear
 	call	_vpaes_schedule_mangle
@@ -546,7 +546,7 @@
 
 
 	movdqa	(%r8,%r10,1),%xmm1
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	leaq	.Lk_opt(%rip),%r11
 	addq	$32,%rdx
 
@@ -622,13 +622,13 @@
 .cfi_startproc	
 
 	pxor	%xmm1,%xmm1
-.byte	102,65,15,58,15,200,15
-.byte	102,69,15,58,15,192,15
+	palignr	$15,%xmm8,%xmm1
+	palignr	$15,%xmm8,%xmm8
 	pxor	%xmm1,%xmm7
 
 
 	pshufd	$0xFF,%xmm0,%xmm0
-.byte	102,15,58,15,192,1
+	palignr	$1,%xmm0,%xmm0
 
 
 
@@ -649,24 +649,24 @@
 	psrld	$4,%xmm1
 	pand	%xmm9,%xmm0
 	movdqa	%xmm11,%xmm2
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	pxor	%xmm1,%xmm0
 	movdqa	%xmm10,%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
 	movdqa	%xmm10,%xmm4
-.byte	102,15,56,0,224
+	pshufb	%xmm0,%xmm4
 	pxor	%xmm2,%xmm4
 	movdqa	%xmm10,%xmm2
-.byte	102,15,56,0,211
+	pshufb	%xmm3,%xmm2
 	pxor	%xmm0,%xmm2
 	movdqa	%xmm10,%xmm3
-.byte	102,15,56,0,220
+	pshufb	%xmm4,%xmm3
 	pxor	%xmm1,%xmm3
 	movdqa	%xmm13,%xmm4
-.byte	102,15,56,0,226
+	pshufb	%xmm2,%xmm4
 	movdqa	%xmm12,%xmm0
-.byte	102,15,56,0,195
+	pshufb	%xmm3,%xmm0
 	pxor	%xmm4,%xmm0
 
 
@@ -694,9 +694,9 @@
 	psrld	$4,%xmm1
 	pand	%xmm9,%xmm0
 	movdqa	(%r11),%xmm2
-.byte	102,15,56,0,208
+	pshufb	%xmm0,%xmm2
 	movdqa	16(%r11),%xmm0
-.byte	102,15,56,0,193
+	pshufb	%xmm1,%xmm0
 	pxor	%xmm2,%xmm0
 	ret
 .cfi_endproc	
@@ -737,11 +737,11 @@
 
 	addq	$16,%rdx
 	pxor	.Lk_s63(%rip),%xmm4
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	movdqa	%xmm4,%xmm3
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	pxor	%xmm4,%xmm3
-.byte	102,15,56,0,229
+	pshufb	%xmm5,%xmm4
 	pxor	%xmm4,%xmm3
 
 	jmp	.Lschedule_mangle_both
@@ -755,40 +755,40 @@
 	pand	%xmm9,%xmm4
 
 	movdqa	0(%r11),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	movdqa	16(%r11),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 
 	movdqa	32(%r11),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	pxor	%xmm3,%xmm2
 	movdqa	48(%r11),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 
 	movdqa	64(%r11),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	pxor	%xmm3,%xmm2
 	movdqa	80(%r11),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
+	pshufb	%xmm5,%xmm3
 
 	movdqa	96(%r11),%xmm2
-.byte	102,15,56,0,212
+	pshufb	%xmm4,%xmm2
 	pxor	%xmm3,%xmm2
 	movdqa	112(%r11),%xmm3
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	pxor	%xmm2,%xmm3
 
 	addq	$-16,%rdx
 
 .Lschedule_mangle_both:
 	movdqa	(%r8,%r10,1),%xmm1
-.byte	102,15,56,0,217
+	pshufb	%xmm1,%xmm3
 	addq	$-16,%r8
 	andq	$0x30,%r8
 	movdqu	%xmm3,(%rdx)
@@ -971,8 +971,8 @@
 	movdqa	.Lrev_ctr(%rip),%xmm1
 	movdqa	%xmm14,%xmm0
 	movdqa	%xmm15,%xmm6
-.byte	102,15,56,0,193
-.byte	102,15,56,0,241
+	pshufb	%xmm1,%xmm0
+	pshufb	%xmm1,%xmm6
 	call	_vpaes_encrypt_core_2x
 	movdqu	(%rdi),%xmm1
 	movdqu	16(%rdi),%xmm2
diff --git a/gen/bcm/vpaes-x86_64-win.asm b/gen/bcm/vpaes-x86_64-win.asm
index ddbfb12..e28ae47 100644
--- a/gen/bcm/vpaes-x86_64-win.asm
+++ b/gen/bcm/vpaes-x86_64-win.asm
@@ -42,9 +42,9 @@
 	movdqu	xmm5,XMMWORD[r9]
 	psrld	xmm1,4
 	pand	xmm0,xmm9
-DB	102,15,56,0,208
+	pshufb	xmm2,xmm0
 	movdqa	xmm0,XMMWORD[(($L$k_ipt+16))]
-DB	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	pxor	xmm2,xmm5
 	add	r9,16
 	pxor	xmm0,xmm2
@@ -56,25 +56,25 @@
 
 	movdqa	xmm4,xmm13
 	movdqa	xmm0,xmm12
-DB	102,15,56,0,226
-DB	102,15,56,0,195
+	pshufb	xmm4,xmm2
+	pshufb	xmm0,xmm3
 	pxor	xmm4,xmm5
 	movdqa	xmm5,xmm15
 	pxor	xmm0,xmm4
 	movdqa	xmm1,XMMWORD[((-64))+r10*1+r11]
-DB	102,15,56,0,234
+	pshufb	xmm5,xmm2
 	movdqa	xmm4,XMMWORD[r10*1+r11]
 	movdqa	xmm2,xmm14
-DB	102,15,56,0,211
+	pshufb	xmm2,xmm3
 	movdqa	xmm3,xmm0
 	pxor	xmm2,xmm5
-DB	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	add	r9,16
 	pxor	xmm0,xmm2
-DB	102,15,56,0,220
+	pshufb	xmm3,xmm4
 	add	r11,16
 	pxor	xmm3,xmm0
-DB	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	and	r11,0x30
 	sub	rax,1
 	pxor	xmm0,xmm3
@@ -86,19 +86,19 @@
 	pandn	xmm1,xmm0
 	psrld	xmm1,4
 	pand	xmm0,xmm9
-DB	102,15,56,0,232
+	pshufb	xmm5,xmm0
 	movdqa	xmm3,xmm10
 	pxor	xmm0,xmm1
-DB	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	movdqa	xmm4,xmm10
 	pxor	xmm3,xmm5
-DB	102,15,56,0,224
+	pshufb	xmm4,xmm0
 	movdqa	xmm2,xmm10
 	pxor	xmm4,xmm5
-DB	102,15,56,0,211
+	pshufb	xmm2,xmm3
 	movdqa	xmm3,xmm10
 	pxor	xmm2,xmm0
-DB	102,15,56,0,220
+	pshufb	xmm3,xmm4
 	movdqu	xmm5,XMMWORD[r9]
 	pxor	xmm3,xmm1
 	jnz	NEAR $L$enc_loop
@@ -106,12 +106,12 @@
 
 	movdqa	xmm4,XMMWORD[((-96))+r10]
 	movdqa	xmm0,XMMWORD[((-80))+r10]
-DB	102,15,56,0,226
+	pshufb	xmm4,xmm2
 	pxor	xmm4,xmm5
-DB	102,15,56,0,195
+	pshufb	xmm0,xmm3
 	movdqa	xmm1,XMMWORD[64+r10*1+r11]
 	pxor	xmm0,xmm4
-DB	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	ret
 
 
@@ -164,12 +164,12 @@
 	psrld	xmm7,4
 	pand	xmm0,xmm9
 	pand	xmm6,xmm9
-DB	102,15,56,0,208
-DB	102,68,15,56,0,198
+	pshufb	xmm2,xmm0
+	pshufb	xmm8,xmm6
 	movdqa	xmm0,XMMWORD[(($L$k_ipt+16))]
 	movdqa	xmm6,xmm0
-DB	102,15,56,0,193
-DB	102,15,56,0,247
+	pshufb	xmm0,xmm1
+	pshufb	xmm6,xmm7
 	pxor	xmm2,xmm5
 	pxor	xmm8,xmm5
 	add	r9,16
@@ -185,10 +185,10 @@
 	movdqa	xmm0,XMMWORD[(($L$k_sb1+16))]
 	movdqa	xmm12,xmm4
 	movdqa	xmm6,xmm0
-DB	102,15,56,0,226
-DB	102,69,15,56,0,224
-DB	102,15,56,0,195
-DB	102,65,15,56,0,243
+	pshufb	xmm4,xmm2
+	pshufb	xmm12,xmm8
+	pshufb	xmm0,xmm3
+	pshufb	xmm6,xmm11
 	pxor	xmm4,xmm5
 	pxor	xmm12,xmm5
 	movdqa	xmm5,XMMWORD[$L$k_sb2]
@@ -197,30 +197,30 @@
 	pxor	xmm6,xmm12
 	movdqa	xmm1,XMMWORD[((-64))+r10*1+r11]
 
-DB	102,15,56,0,234
-DB	102,69,15,56,0,232
+	pshufb	xmm5,xmm2
+	pshufb	xmm13,xmm8
 	movdqa	xmm4,XMMWORD[r10*1+r11]
 
 	movdqa	xmm2,XMMWORD[(($L$k_sb2+16))]
 	movdqa	xmm8,xmm2
-DB	102,15,56,0,211
-DB	102,69,15,56,0,195
+	pshufb	xmm2,xmm3
+	pshufb	xmm8,xmm11
 	movdqa	xmm3,xmm0
 	movdqa	xmm11,xmm6
 	pxor	xmm2,xmm5
 	pxor	xmm8,xmm13
-DB	102,15,56,0,193
-DB	102,15,56,0,241
+	pshufb	xmm0,xmm1
+	pshufb	xmm6,xmm1
 	add	r9,16
 	pxor	xmm0,xmm2
 	pxor	xmm6,xmm8
-DB	102,15,56,0,220
-DB	102,68,15,56,0,220
+	pshufb	xmm3,xmm4
+	pshufb	xmm11,xmm4
 	add	r11,16
 	pxor	xmm3,xmm0
 	pxor	xmm11,xmm6
-DB	102,15,56,0,193
-DB	102,15,56,0,241
+	pshufb	xmm0,xmm1
+	pshufb	xmm6,xmm1
 	and	r11,0x30
 	sub	rax,1
 	pxor	xmm0,xmm3
@@ -238,32 +238,32 @@
 	psrld	xmm7,4
 	pand	xmm0,xmm9
 	pand	xmm6,xmm9
-DB	102,15,56,0,232
-DB	102,68,15,56,0,238
+	pshufb	xmm5,xmm0
+	pshufb	xmm13,xmm6
 	movdqa	xmm3,xmm10
 	movdqa	xmm11,xmm10
 	pxor	xmm0,xmm1
 	pxor	xmm6,xmm7
-DB	102,15,56,0,217
-DB	102,68,15,56,0,223
+	pshufb	xmm3,xmm1
+	pshufb	xmm11,xmm7
 	movdqa	xmm4,xmm10
 	movdqa	xmm12,xmm10
 	pxor	xmm3,xmm5
 	pxor	xmm11,xmm13
-DB	102,15,56,0,224
-DB	102,68,15,56,0,230
+	pshufb	xmm4,xmm0
+	pshufb	xmm12,xmm6
 	movdqa	xmm2,xmm10
 	movdqa	xmm8,xmm10
 	pxor	xmm4,xmm5
 	pxor	xmm12,xmm13
-DB	102,15,56,0,211
-DB	102,69,15,56,0,195
+	pshufb	xmm2,xmm3
+	pshufb	xmm8,xmm11
 	movdqa	xmm3,xmm10
 	movdqa	xmm11,xmm10
 	pxor	xmm2,xmm0
 	pxor	xmm8,xmm6
-DB	102,15,56,0,220
-DB	102,69,15,56,0,220
+	pshufb	xmm3,xmm4
+	pshufb	xmm11,xmm12
 	movdqu	xmm5,XMMWORD[r9]
 
 	pxor	xmm3,xmm1
@@ -275,18 +275,18 @@
 	movdqa	xmm0,XMMWORD[((-80))+r10]
 	movdqa	xmm12,xmm4
 	movdqa	xmm6,xmm0
-DB	102,15,56,0,226
-DB	102,69,15,56,0,224
+	pshufb	xmm4,xmm2
+	pshufb	xmm12,xmm8
 	pxor	xmm4,xmm5
 	pxor	xmm12,xmm5
-DB	102,15,56,0,195
-DB	102,65,15,56,0,243
+	pshufb	xmm0,xmm3
+	pshufb	xmm6,xmm11
 	movdqa	xmm1,XMMWORD[64+r10*1+r11]
 
 	pxor	xmm0,xmm4
 	pxor	xmm6,xmm12
-DB	102,15,56,0,193
-DB	102,15,56,0,241
+	pshufb	xmm0,xmm1
+	pshufb	xmm6,xmm1
 	ret
 
 
@@ -310,11 +310,11 @@
 	movdqu	xmm5,XMMWORD[r9]
 	shl	r11,4
 	pand	xmm0,xmm9
-DB	102,15,56,0,208
+	pshufb	xmm2,xmm0
 	movdqa	xmm0,XMMWORD[(($L$k_dipt+16))]
 	xor	r11,0x30
 	lea	r10,[$L$k_dsbd]
-DB	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	and	r11,0x30
 	pxor	xmm2,xmm5
 	movdqa	xmm5,XMMWORD[(($L$k_mc_forward+48))]
@@ -330,35 +330,35 @@
 
 	movdqa	xmm4,XMMWORD[((-32))+r10]
 	movdqa	xmm1,XMMWORD[((-16))+r10]
-DB	102,15,56,0,226
-DB	102,15,56,0,203
+	pshufb	xmm4,xmm2
+	pshufb	xmm1,xmm3
 	pxor	xmm0,xmm4
 	movdqa	xmm4,XMMWORD[r10]
 	pxor	xmm0,xmm1
 	movdqa	xmm1,XMMWORD[16+r10]
 
-DB	102,15,56,0,226
-DB	102,15,56,0,197
-DB	102,15,56,0,203
+	pshufb	xmm4,xmm2
+	pshufb	xmm0,xmm5
+	pshufb	xmm1,xmm3
 	pxor	xmm0,xmm4
 	movdqa	xmm4,XMMWORD[32+r10]
 	pxor	xmm0,xmm1
 	movdqa	xmm1,XMMWORD[48+r10]
 
-DB	102,15,56,0,226
-DB	102,15,56,0,197
-DB	102,15,56,0,203
+	pshufb	xmm4,xmm2
+	pshufb	xmm0,xmm5
+	pshufb	xmm1,xmm3
 	pxor	xmm0,xmm4
 	movdqa	xmm4,XMMWORD[64+r10]
 	pxor	xmm0,xmm1
 	movdqa	xmm1,XMMWORD[80+r10]
 
-DB	102,15,56,0,226
-DB	102,15,56,0,197
-DB	102,15,56,0,203
+	pshufb	xmm4,xmm2
+	pshufb	xmm0,xmm5
+	pshufb	xmm1,xmm3
 	pxor	xmm0,xmm4
 	add	r9,16
-DB	102,15,58,15,237,12
+	palignr	xmm5,xmm5,12
 	pxor	xmm0,xmm1
 	sub	rax,1
 
@@ -369,32 +369,32 @@
 	movdqa	xmm2,xmm11
 	psrld	xmm1,4
 	pand	xmm0,xmm9
-DB	102,15,56,0,208
+	pshufb	xmm2,xmm0
 	movdqa	xmm3,xmm10
 	pxor	xmm0,xmm1
-DB	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	movdqa	xmm4,xmm10
 	pxor	xmm3,xmm2
-DB	102,15,56,0,224
+	pshufb	xmm4,xmm0
 	pxor	xmm4,xmm2
 	movdqa	xmm2,xmm10
-DB	102,15,56,0,211
+	pshufb	xmm2,xmm3
 	movdqa	xmm3,xmm10
 	pxor	xmm2,xmm0
-DB	102,15,56,0,220
+	pshufb	xmm3,xmm4
 	movdqu	xmm0,XMMWORD[r9]
 	pxor	xmm3,xmm1
 	jnz	NEAR $L$dec_loop
 
 
 	movdqa	xmm4,XMMWORD[96+r10]
-DB	102,15,56,0,226
+	pshufb	xmm4,xmm2
 	pxor	xmm4,xmm0
 	movdqa	xmm0,XMMWORD[112+r10]
 	movdqa	xmm2,XMMWORD[((-352))+r11]
-DB	102,15,56,0,195
+	pshufb	xmm0,xmm3
 	pxor	xmm0,xmm4
-DB	102,15,56,0,194
+	pshufb	xmm0,xmm2
 	ret
 
 
@@ -434,7 +434,7 @@
 $L$schedule_am_decrypting:
 
 	movdqa	xmm1,XMMWORD[r10*1+r8]
-DB	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	movdqu	XMMWORD[rdx],xmm3
 	xor	r8,0x30
 
@@ -488,7 +488,7 @@
 
 $L$oop_schedule_192:
 	call	_vpaes_schedule_round
-DB	102,15,58,15,198,8
+	palignr	xmm0,xmm6,8
 	call	_vpaes_schedule_mangle
 	call	_vpaes_schedule_192_smear
 	call	_vpaes_schedule_mangle
@@ -554,7 +554,7 @@
 
 
 	movdqa	xmm1,XMMWORD[r10*1+r8]
-DB	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	lea	r11,[$L$k_opt]
 	add	rdx,32
 
@@ -630,13 +630,13 @@
 
 
 	pxor	xmm1,xmm1
-DB	102,65,15,58,15,200,15
-DB	102,69,15,58,15,192,15
+	palignr	xmm1,xmm8,15
+	palignr	xmm8,xmm8,15
 	pxor	xmm7,xmm1
 
 
 	pshufd	xmm0,xmm0,0xFF
-DB	102,15,58,15,192,1
+	palignr	xmm0,xmm0,1
 
 
 
@@ -657,24 +657,24 @@
 	psrld	xmm1,4
 	pand	xmm0,xmm9
 	movdqa	xmm2,xmm11
-DB	102,15,56,0,208
+	pshufb	xmm2,xmm0
 	pxor	xmm0,xmm1
 	movdqa	xmm3,xmm10
-DB	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	pxor	xmm3,xmm2
 	movdqa	xmm4,xmm10
-DB	102,15,56,0,224
+	pshufb	xmm4,xmm0
 	pxor	xmm4,xmm2
 	movdqa	xmm2,xmm10
-DB	102,15,56,0,211
+	pshufb	xmm2,xmm3
 	pxor	xmm2,xmm0
 	movdqa	xmm3,xmm10
-DB	102,15,56,0,220
+	pshufb	xmm3,xmm4
 	pxor	xmm3,xmm1
 	movdqa	xmm4,xmm13
-DB	102,15,56,0,226
+	pshufb	xmm4,xmm2
 	movdqa	xmm0,xmm12
-DB	102,15,56,0,195
+	pshufb	xmm0,xmm3
 	pxor	xmm0,xmm4
 
 
@@ -702,9 +702,9 @@
 	psrld	xmm1,4
 	pand	xmm0,xmm9
 	movdqa	xmm2,XMMWORD[r11]
-DB	102,15,56,0,208
+	pshufb	xmm2,xmm0
 	movdqa	xmm0,XMMWORD[16+r11]
-DB	102,15,56,0,193
+	pshufb	xmm0,xmm1
 	pxor	xmm0,xmm2
 	ret
 
@@ -745,11 +745,11 @@
 
 	add	rdx,16
 	pxor	xmm4,XMMWORD[$L$k_s63]
-DB	102,15,56,0,229
+	pshufb	xmm4,xmm5
 	movdqa	xmm3,xmm4
-DB	102,15,56,0,229
+	pshufb	xmm4,xmm5
 	pxor	xmm3,xmm4
-DB	102,15,56,0,229
+	pshufb	xmm4,xmm5
 	pxor	xmm3,xmm4
 
 	jmp	NEAR $L$schedule_mangle_both
@@ -763,40 +763,40 @@
 	pand	xmm4,xmm9
 
 	movdqa	xmm2,XMMWORD[r11]
-DB	102,15,56,0,212
+	pshufb	xmm2,xmm4
 	movdqa	xmm3,XMMWORD[16+r11]
-DB	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	pxor	xmm3,xmm2
-DB	102,15,56,0,221
+	pshufb	xmm3,xmm5
 
 	movdqa	xmm2,XMMWORD[32+r11]
-DB	102,15,56,0,212
+	pshufb	xmm2,xmm4
 	pxor	xmm2,xmm3
 	movdqa	xmm3,XMMWORD[48+r11]
-DB	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	pxor	xmm3,xmm2
-DB	102,15,56,0,221
+	pshufb	xmm3,xmm5
 
 	movdqa	xmm2,XMMWORD[64+r11]
-DB	102,15,56,0,212
+	pshufb	xmm2,xmm4
 	pxor	xmm2,xmm3
 	movdqa	xmm3,XMMWORD[80+r11]
-DB	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	pxor	xmm3,xmm2
-DB	102,15,56,0,221
+	pshufb	xmm3,xmm5
 
 	movdqa	xmm2,XMMWORD[96+r11]
-DB	102,15,56,0,212
+	pshufb	xmm2,xmm4
 	pxor	xmm2,xmm3
 	movdqa	xmm3,XMMWORD[112+r11]
-DB	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	pxor	xmm3,xmm2
 
 	add	rdx,-16
 
 $L$schedule_mangle_both:
 	movdqa	xmm1,XMMWORD[r10*1+r8]
-DB	102,15,56,0,217
+	pshufb	xmm3,xmm1
 	add	r8,-16
 	and	r8,0x30
 	movdqu	XMMWORD[rdx],xmm3
@@ -1172,8 +1172,8 @@
 	movdqa	xmm1,XMMWORD[$L$rev_ctr]
 	movdqa	xmm0,xmm14
 	movdqa	xmm6,xmm15
-DB	102,15,56,0,193
-DB	102,15,56,0,241
+	pshufb	xmm0,xmm1
+	pshufb	xmm6,xmm1
 	call	_vpaes_encrypt_core_2x
 	movdqu	xmm1,XMMWORD[rdi]
 	movdqu	xmm2,XMMWORD[16+rdi]
diff --git a/gen/bcm/x86_64-mont-apple.S b/gen/bcm/x86_64-mont-apple.S
index 4bf0c6d..d429f7c 100644
--- a/gen/bcm/x86_64-mont-apple.S
+++ b/gen/bcm/x86_64-mont-apple.S
@@ -632,7 +632,7 @@
 	sbbq	$0,%rax
 	movq	%rbp,24(%rdi,%r14,8)
 	pxor	%xmm0,%xmm0
-.byte	102,72,15,110,224
+	movq	%rax,%xmm4
 	pcmpeqd	%xmm5,%xmm5
 	pshufd	$0,%xmm4,%xmm4
 	movq	%r9,%r15
@@ -764,10 +764,10 @@
 
 L$sqr8x_body:
 
-.byte	102,72,15,110,209
+	movq	%rcx,%xmm2
 	pxor	%xmm0,%xmm0
-.byte	102,72,15,110,207
-.byte	102,73,15,110,218
+	movq	%rdi,%xmm1
+	movq	%r10,%xmm3
 	testq	%rdx,%rdx
 	jz	L$sqr8x_nox
 
@@ -779,7 +779,7 @@
 	leaq	(%r8,%rcx,1),%rbx
 	movq	%rcx,%r9
 	movq	%rcx,%rdx
-.byte	102,72,15,126,207
+	movq	%xmm1,%rdi
 	sarq	$3+2,%rcx
 	jmp	L$sqr8x_sub
 
@@ -793,7 +793,7 @@
 	leaq	(%rdi,%r9,1),%rbx
 	movq	%r9,%rcx
 	movq	%r9,%rdx
-.byte	102,72,15,126,207
+	movq	%xmm1,%rdi
 	sarq	$3+2,%rcx
 	jmp	L$sqr8x_sub
 
@@ -821,7 +821,7 @@
 	leaq	(%rbx,%r9,1),%rbx
 	leaq	(%rdi,%r9,1),%rdi
 
-.byte	102,72,15,110,200
+	movq	%rax,%xmm1
 	pxor	%xmm0,%xmm0
 	pshufd	$0,%xmm1,%xmm1
 	movq	40(%rsp),%rsi
@@ -1179,7 +1179,7 @@
 	leaq	64(%rsp),%rbx
 	subq	%rdx,%rdi
 
-.byte	102,73,15,110,207
+	movq	%r15,%xmm1
 	pxor	%xmm0,%xmm0
 	pshufd	$0,%xmm1,%xmm1
 	movq	40(%rsp),%rsi
diff --git a/gen/bcm/x86_64-mont-linux.S b/gen/bcm/x86_64-mont-linux.S
index 02b282d..630bb72 100644
--- a/gen/bcm/x86_64-mont-linux.S
+++ b/gen/bcm/x86_64-mont-linux.S
@@ -632,7 +632,7 @@
 	sbbq	$0,%rax
 	movq	%rbp,24(%rdi,%r14,8)
 	pxor	%xmm0,%xmm0
-.byte	102,72,15,110,224
+	movq	%rax,%xmm4
 	pcmpeqd	%xmm5,%xmm5
 	pshufd	$0,%xmm4,%xmm4
 	movq	%r9,%r15
@@ -766,10 +766,10 @@
 .cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
 .Lsqr8x_body:
 
-.byte	102,72,15,110,209
+	movq	%rcx,%xmm2
 	pxor	%xmm0,%xmm0
-.byte	102,72,15,110,207
-.byte	102,73,15,110,218
+	movq	%rdi,%xmm1
+	movq	%r10,%xmm3
 	testq	%rdx,%rdx
 	jz	.Lsqr8x_nox
 
@@ -781,7 +781,7 @@
 	leaq	(%r8,%rcx,1),%rbx
 	movq	%rcx,%r9
 	movq	%rcx,%rdx
-.byte	102,72,15,126,207
+	movq	%xmm1,%rdi
 	sarq	$3+2,%rcx
 	jmp	.Lsqr8x_sub
 
@@ -795,7 +795,7 @@
 	leaq	(%rdi,%r9,1),%rbx
 	movq	%r9,%rcx
 	movq	%r9,%rdx
-.byte	102,72,15,126,207
+	movq	%xmm1,%rdi
 	sarq	$3+2,%rcx
 	jmp	.Lsqr8x_sub
 
@@ -823,7 +823,7 @@
 	leaq	(%rbx,%r9,1),%rbx
 	leaq	(%rdi,%r9,1),%rdi
 
-.byte	102,72,15,110,200
+	movq	%rax,%xmm1
 	pxor	%xmm0,%xmm0
 	pshufd	$0,%xmm1,%xmm1
 	movq	40(%rsp),%rsi
@@ -1181,7 +1181,7 @@
 	leaq	64(%rsp),%rbx
 	subq	%rdx,%rdi
 
-.byte	102,73,15,110,207
+	movq	%r15,%xmm1
 	pxor	%xmm0,%xmm0
 	pshufd	$0,%xmm1,%xmm1
 	movq	40(%rsp),%rsi
diff --git a/gen/bcm/x86_64-mont-win.asm b/gen/bcm/x86_64-mont-win.asm
index b0611fc..7e54c66 100644
--- a/gen/bcm/x86_64-mont-win.asm
+++ b/gen/bcm/x86_64-mont-win.asm
@@ -664,7 +664,7 @@
 	sbb	rax,0
 	mov	QWORD[24+r14*8+rdi],rbp
 	pxor	xmm0,xmm0
-DB	102,72,15,110,224
+	movq	xmm4,rax
 	pcmpeqd	xmm5,xmm5
 	pshufd	xmm4,xmm4,0
 	mov	r15,r9
@@ -809,10 +809,10 @@
 
 $L$sqr8x_body:
 
-DB	102,72,15,110,209
+	movq	xmm2,rcx
 	pxor	xmm0,xmm0
-DB	102,72,15,110,207
-DB	102,73,15,110,218
+	movq	xmm1,rdi
+	movq	xmm3,r10
 	test	rdx,rdx
 	jz	NEAR $L$sqr8x_nox
 
@@ -824,7 +824,7 @@
 	lea	rbx,[rcx*1+r8]
 	mov	r9,rcx
 	mov	rdx,rcx
-DB	102,72,15,126,207
+	movq	rdi,xmm1
 	sar	rcx,3+2
 	jmp	NEAR $L$sqr8x_sub
 
@@ -838,7 +838,7 @@
 	lea	rbx,[r9*1+rdi]
 	mov	rcx,r9
 	mov	rdx,r9
-DB	102,72,15,126,207
+	movq	rdi,xmm1
 	sar	rcx,3+2
 	jmp	NEAR $L$sqr8x_sub
 
@@ -866,7 +866,7 @@
 	lea	rbx,[r9*1+rbx]
 	lea	rdi,[r9*1+rdi]
 
-DB	102,72,15,110,200
+	movq	xmm1,rax
 	pxor	xmm0,xmm0
 	pshufd	xmm1,xmm1,0
 	mov	rsi,QWORD[40+rsp]
@@ -1237,7 +1237,7 @@
 	lea	rbx,[64+rsp]
 	sub	rdi,rdx
 
-DB	102,73,15,110,207
+	movq	xmm1,r15
 	pxor	xmm0,xmm0
 	pshufd	xmm1,xmm1,0
 	mov	rsi,QWORD[40+rsp]
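
The x86_64-mont hunks above and the mont5 hunks that follow also retire a second pattern: 66 REX.W 0F 6E /r and 66 REX.W 0F 7E /r, i.e. movq between a 64-bit general-purpose register and an XMM register. Under the same caveat as before (illustrative Python, not part of the patch):

    # Sketch of the GPR<->XMM movq encodings replaced in the bn_mont files.
    # Hypothetical helper for illustration; not part of the patch.

    def rex_w(xmm, gpr):
        # REX.W for the 64-bit operand, plus REX.R/REX.B for xmm8+/r8+ registers.
        return 0x48 | ((xmm >= 8) << 2) | (gpr >= 8)

    def movq_to_xmm(xmm, gpr):
        # movq xmmN, r64  ->  66 REX.W 0F 6E /r  (XMM in reg, GPR in r/m)
        return [0x66, rex_w(xmm, gpr), 0x0F, 0x6E, 0xC0 | ((xmm & 7) << 3) | (gpr & 7)]

    def movq_from_xmm(gpr, xmm):
        # movq r64, xmmN  ->  66 REX.W 0F 7E /r  (XMM still in reg, GPR in r/m)
        return [0x66, rex_w(xmm, gpr), 0x0F, 0x7E, 0xC0 | ((xmm & 7) << 3) | (gpr & 7)]

    assert movq_to_xmm(4, 0) == [102, 72, 15, 110, 224]     # movq %rax,%xmm4
    assert movq_to_xmm(3, 10) == [102, 73, 15, 110, 218]    # movq %r10,%xmm3
    assert movq_from_xmm(7, 1) == [102, 72, 15, 126, 207]   # movq %xmm1,%rdi
    print("encodings match")

In both directions the XMM register occupies the ModR/M reg field and the general register the r/m field, so only the opcode byte (6E to load into an XMM register, 7E to read it back out) distinguishes the two forms.
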
diff --git a/gen/bcm/x86_64-mont5-apple.S b/gen/bcm/x86_64-mont5-apple.S
index 5cf770f..cd7d797 100644
--- a/gen/bcm/x86_64-mont5-apple.S
+++ b/gen/bcm/x86_64-mont5-apple.S
@@ -196,7 +196,7 @@
 	pshufd	$0x4e,%xmm0,%xmm1
 	por	%xmm1,%xmm0
 	leaq	256(%r12),%r12
-.byte	102,72,15,126,195
+	movq	%xmm0,%rbx
 
 	movq	(%r8),%r8
 	movq	(%rsi),%rax
@@ -322,7 +322,7 @@
 	leaq	256(%r12),%r12
 
 	movq	(%rsi),%rax
-.byte	102,72,15,126,195
+	movq	%xmm0,%rbx
 
 	xorq	%r15,%r15
 	movq	%r8,%rbp
@@ -691,7 +691,7 @@
 	pshufd	$0x4e,%xmm0,%xmm1
 	por	%xmm1,%xmm0
 	leaq	256(%r12),%r12
-.byte	102,72,15,126,195
+	movq	%xmm0,%rbx
 
 	movq	%r13,16+8(%rsp)
 	movq	%rdi,56+8(%rsp)
@@ -899,7 +899,7 @@
 	pshufd	$0x4e,%xmm4,%xmm0
 	por	%xmm4,%xmm0
 	leaq	256(%r12),%r12
-.byte	102,72,15,126,195
+	movq	%xmm0,%rbx
 
 	movq	(%r14,%r9,1),%r10
 	movq	%r8,%rbp
@@ -1171,10 +1171,10 @@
 	movq	%rax,40(%rsp)
 
 L$power5_body:
-.byte	102,72,15,110,207
-.byte	102,72,15,110,209
-.byte	102,73,15,110,218
-.byte	102,72,15,110,226
+	movq	%rdi,%xmm1
+	movq	%rcx,%xmm2
+	movq	%r10,%xmm3
+	movq	%rdx,%xmm4
 
 	call	__bn_sqr8x_internal
 	call	__bn_post4x_internal
@@ -1187,8 +1187,8 @@
 	call	__bn_sqr8x_internal
 	call	__bn_post4x_internal
 
-.byte	102,72,15,126,209
-.byte	102,72,15,126,226
+	movq	%xmm2,%rcx
+	movq	%xmm4,%rdx
 	movq	%rsi,%rdi
 	movq	40(%rsp),%rax
 	leaq	32(%rsp),%r8
@@ -1740,7 +1740,7 @@
 	adcq	%rdx,%r8
 	movq	%rbx,-16(%rdi)
 	movq	%r8,-8(%rdi)
-.byte	102,72,15,126,213
+	movq	%xmm2,%rbp
 __bn_sqr8x_reduction:
 	xorq	%rax,%rax
 	leaq	(%r9,%rbp,1),%rcx
@@ -1984,11 +1984,11 @@
 	movq	-8(%rbp),%rcx
 	xorq	%rsi,%rsi
 
-.byte	102,72,15,126,213
+	movq	%xmm2,%rbp
 
 	movq	%r8,0(%rdi)
 	movq	%r9,8(%rdi)
-.byte	102,73,15,126,217
+	movq	%xmm3,%r9
 	movq	%r10,16(%rdi)
 	movq	%r11,24(%rdi)
 	movq	%r12,32(%rdi)
@@ -2009,9 +2009,9 @@
 	movq	0(%rbp),%r12
 	leaq	(%rdi,%r9,1),%rbx
 	movq	%r9,%rcx
-.byte	102,72,15,126,207
+	movq	%xmm1,%rdi
 	negq	%rax
-.byte	102,72,15,126,206
+	movq	%xmm1,%rsi
 	sarq	$3+2,%rcx
 	decq	%r12
 	xorq	%r10,%r10
@@ -2321,7 +2321,7 @@
 	pshufd	$0x4e,%xmm0,%xmm1
 	por	%xmm1,%xmm0
 	leaq	256(%rdi),%rdi
-.byte	102,72,15,126,194
+	movq	%xmm0,%rdx
 	leaq	64+32+8(%rsp),%rbx
 
 	movq	%rdx,%r9
@@ -2472,7 +2472,7 @@
 	pshufd	$0x4e,%xmm4,%xmm0
 	por	%xmm4,%xmm0
 	leaq	256(%rdi),%rdi
-.byte	102,72,15,126,194
+	movq	%xmm0,%rdx
 
 	movq	%rbp,(%rbx)
 	leaq	32(%rbx,%rax,1),%rbx
@@ -2689,10 +2689,10 @@
 
 
 	pxor	%xmm0,%xmm0
-.byte	102,72,15,110,207
-.byte	102,72,15,110,209
-.byte	102,73,15,110,218
-.byte	102,72,15,110,226
+	movq	%rdi,%xmm1
+	movq	%rcx,%xmm2
+	movq	%r10,%xmm3
+	movq	%rdx,%xmm4
 	movq	%r8,32(%rsp)
 	movq	%rax,40(%rsp)
 
@@ -2711,8 +2711,8 @@
 
 	movq	%r10,%r9
 	movq	%rsi,%rdi
-.byte	102,72,15,126,209
-.byte	102,72,15,126,226
+	movq	%xmm2,%rcx
+	movq	%xmm4,%rdx
 	movq	40(%rsp),%rax
 
 	call	mulx4x_internal
@@ -3077,7 +3077,7 @@
 .p2align	5
 L$sqrx8x_outer_break:
 	movq	%r9,72(%rdi)
-.byte	102,72,15,126,217
+	movq	%xmm3,%rcx
 	movq	%r10,80(%rdi)
 	movq	%r11,88(%rdi)
 	movq	%r12,96(%rdi)
@@ -3151,7 +3151,7 @@
 	movq	%rax,48(%rdi)
 	movq	%rbx,56(%rdi)
 	leaq	64(%rdi),%rdi
-.byte	102,72,15,126,213
+	movq	%xmm2,%rbp
 __bn_sqrx8x_reduction:
 	xorl	%eax,%eax
 	movq	32+8(%rsp),%rbx
@@ -3331,10 +3331,10 @@
 	subq	16+8(%rsp),%rsi
 L$sqrx8x_no_tail:
 	adcq	0(%rdi),%r8
-.byte	102,72,15,126,217
+	movq	%xmm3,%rcx
 	adcq	8(%rdi),%r9
 	movq	56(%rbp),%rsi
-.byte	102,72,15,126,213
+	movq	%xmm2,%rbp
 	adcq	16(%rdi),%r10
 	adcq	24(%rdi),%r11
 	adcq	32(%rdi),%r12
@@ -3372,8 +3372,8 @@
 	negq	%rax
 	sarq	$3+2,%rcx
 
-.byte	102,72,15,126,202
-.byte	102,72,15,126,206
+	movq	%xmm1,%rdx
+	movq	%xmm1,%rsi
 	decq	%r12
 	movq	8(%rbp),%r13
 	xorq	%r8,%r8
diff --git a/gen/bcm/x86_64-mont5-linux.S b/gen/bcm/x86_64-mont5-linux.S
index dcc02fc..a147041 100644
--- a/gen/bcm/x86_64-mont5-linux.S
+++ b/gen/bcm/x86_64-mont5-linux.S
@@ -196,7 +196,7 @@
 	pshufd	$0x4e,%xmm0,%xmm1
 	por	%xmm1,%xmm0
 	leaq	256(%r12),%r12
-.byte	102,72,15,126,195
+	movq	%xmm0,%rbx
 
 	movq	(%r8),%r8
 	movq	(%rsi),%rax
@@ -322,7 +322,7 @@
 	leaq	256(%r12),%r12
 
 	movq	(%rsi),%rax
-.byte	102,72,15,126,195
+	movq	%xmm0,%rbx
 
 	xorq	%r15,%r15
 	movq	%r8,%rbp
@@ -691,7 +691,7 @@
 	pshufd	$0x4e,%xmm0,%xmm1
 	por	%xmm1,%xmm0
 	leaq	256(%r12),%r12
-.byte	102,72,15,126,195
+	movq	%xmm0,%rbx
 
 	movq	%r13,16+8(%rsp)
 	movq	%rdi,56+8(%rsp)
@@ -899,7 +899,7 @@
 	pshufd	$0x4e,%xmm4,%xmm0
 	por	%xmm4,%xmm0
 	leaq	256(%r12),%r12
-.byte	102,72,15,126,195
+	movq	%xmm0,%rbx
 
 	movq	(%r14,%r9,1),%r10
 	movq	%r8,%rbp
@@ -1171,10 +1171,10 @@
 	movq	%rax,40(%rsp)
 .cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
 .Lpower5_body:
-.byte	102,72,15,110,207
-.byte	102,72,15,110,209
-.byte	102,73,15,110,218
-.byte	102,72,15,110,226
+	movq	%rdi,%xmm1
+	movq	%rcx,%xmm2
+	movq	%r10,%xmm3
+	movq	%rdx,%xmm4
 
 	call	__bn_sqr8x_internal
 	call	__bn_post4x_internal
@@ -1187,8 +1187,8 @@
 	call	__bn_sqr8x_internal
 	call	__bn_post4x_internal
 
-.byte	102,72,15,126,209
-.byte	102,72,15,126,226
+	movq	%xmm2,%rcx
+	movq	%xmm4,%rdx
 	movq	%rsi,%rdi
 	movq	40(%rsp),%rax
 	leaq	32(%rsp),%r8
@@ -1740,7 +1740,7 @@
 	adcq	%rdx,%r8
 	movq	%rbx,-16(%rdi)
 	movq	%r8,-8(%rdi)
-.byte	102,72,15,126,213
+	movq	%xmm2,%rbp
 __bn_sqr8x_reduction:
 	xorq	%rax,%rax
 	leaq	(%r9,%rbp,1),%rcx
@@ -1984,11 +1984,11 @@
 	movq	-8(%rbp),%rcx
 	xorq	%rsi,%rsi
 
-.byte	102,72,15,126,213
+	movq	%xmm2,%rbp
 
 	movq	%r8,0(%rdi)
 	movq	%r9,8(%rdi)
-.byte	102,73,15,126,217
+	movq	%xmm3,%r9
 	movq	%r10,16(%rdi)
 	movq	%r11,24(%rdi)
 	movq	%r12,32(%rdi)
@@ -2009,9 +2009,9 @@
 	movq	0(%rbp),%r12
 	leaq	(%rdi,%r9,1),%rbx
 	movq	%r9,%rcx
-.byte	102,72,15,126,207
+	movq	%xmm1,%rdi
 	negq	%rax
-.byte	102,72,15,126,206
+	movq	%xmm1,%rsi
 	sarq	$3+2,%rcx
 	decq	%r12
 	xorq	%r10,%r10
@@ -2321,7 +2321,7 @@
 	pshufd	$0x4e,%xmm0,%xmm1
 	por	%xmm1,%xmm0
 	leaq	256(%rdi),%rdi
-.byte	102,72,15,126,194
+	movq	%xmm0,%rdx
 	leaq	64+32+8(%rsp),%rbx
 
 	movq	%rdx,%r9
@@ -2472,7 +2472,7 @@
 	pshufd	$0x4e,%xmm4,%xmm0
 	por	%xmm4,%xmm0
 	leaq	256(%rdi),%rdi
-.byte	102,72,15,126,194
+	movq	%xmm0,%rdx
 
 	movq	%rbp,(%rbx)
 	leaq	32(%rbx,%rax,1),%rbx
@@ -2689,10 +2689,10 @@
 
 
 	pxor	%xmm0,%xmm0
-.byte	102,72,15,110,207
-.byte	102,72,15,110,209
-.byte	102,73,15,110,218
-.byte	102,72,15,110,226
+	movq	%rdi,%xmm1
+	movq	%rcx,%xmm2
+	movq	%r10,%xmm3
+	movq	%rdx,%xmm4
 	movq	%r8,32(%rsp)
 	movq	%rax,40(%rsp)
 .cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
@@ -2711,8 +2711,8 @@
 
 	movq	%r10,%r9
 	movq	%rsi,%rdi
-.byte	102,72,15,126,209
-.byte	102,72,15,126,226
+	movq	%xmm2,%rcx
+	movq	%xmm4,%rdx
 	movq	40(%rsp),%rax
 
 	call	mulx4x_internal
@@ -3077,7 +3077,7 @@
 .align	32
 .Lsqrx8x_outer_break:
 	movq	%r9,72(%rdi)
-.byte	102,72,15,126,217
+	movq	%xmm3,%rcx
 	movq	%r10,80(%rdi)
 	movq	%r11,88(%rdi)
 	movq	%r12,96(%rdi)
@@ -3151,7 +3151,7 @@
 	movq	%rax,48(%rdi)
 	movq	%rbx,56(%rdi)
 	leaq	64(%rdi),%rdi
-.byte	102,72,15,126,213
+	movq	%xmm2,%rbp
 __bn_sqrx8x_reduction:
 	xorl	%eax,%eax
 	movq	32+8(%rsp),%rbx
@@ -3331,10 +3331,10 @@
 	subq	16+8(%rsp),%rsi
 .Lsqrx8x_no_tail:
 	adcq	0(%rdi),%r8
-.byte	102,72,15,126,217
+	movq	%xmm3,%rcx
 	adcq	8(%rdi),%r9
 	movq	56(%rbp),%rsi
-.byte	102,72,15,126,213
+	movq	%xmm2,%rbp
 	adcq	16(%rdi),%r10
 	adcq	24(%rdi),%r11
 	adcq	32(%rdi),%r12
@@ -3372,8 +3372,8 @@
 	negq	%rax
 	sarq	$3+2,%rcx
 
-.byte	102,72,15,126,202
-.byte	102,72,15,126,206
+	movq	%xmm1,%rdx
+	movq	%xmm1,%rsi
 	decq	%r12
 	movq	8(%rbp),%r13
 	xorq	%r8,%r8
diff --git a/gen/bcm/x86_64-mont5-win.asm b/gen/bcm/x86_64-mont5-win.asm
index 3b12405..5ddeb86 100644
--- a/gen/bcm/x86_64-mont5-win.asm
+++ b/gen/bcm/x86_64-mont5-win.asm
@@ -215,7 +215,7 @@
 	pshufd	xmm1,xmm0,0x4e
 	por	xmm0,xmm1
 	lea	r12,[256+r12]
-DB	102,72,15,126,195
+	movq	rbx,xmm0
 
 	mov	r8,QWORD[r8]
 	mov	rax,QWORD[rsi]
@@ -341,7 +341,7 @@
 	lea	r12,[256+r12]
 
 	mov	rax,QWORD[rsi]
-DB	102,72,15,126,195
+	movq	rbx,xmm0
 
 	xor	r15,r15
 	mov	rbp,r8
@@ -725,7 +725,7 @@
 	pshufd	xmm1,xmm0,0x4e
 	por	xmm0,xmm1
 	lea	r12,[256+r12]
-DB	102,72,15,126,195
+	movq	rbx,xmm0
 
 	mov	QWORD[((16+8))+rsp],r13
 	mov	QWORD[((56+8))+rsp],rdi
@@ -933,7 +933,7 @@
 	pshufd	xmm0,xmm4,0x4e
 	por	xmm0,xmm4
 	lea	r12,[256+r12]
-DB	102,72,15,126,195
+	movq	rbx,xmm0
 
 	mov	r10,QWORD[r9*1+r14]
 	mov	rbp,r8
@@ -1216,10 +1216,10 @@
 	mov	QWORD[40+rsp],rax
 
 $L$power5_body:
-DB	102,72,15,110,207
-DB	102,72,15,110,209
-DB	102,73,15,110,218
-DB	102,72,15,110,226
+	movq	xmm1,rdi
+	movq	xmm2,rcx
+	movq	xmm3,r10
+	movq	xmm4,rdx
 
 	call	__bn_sqr8x_internal
 	call	__bn_post4x_internal
@@ -1232,8 +1232,8 @@
 	call	__bn_sqr8x_internal
 	call	__bn_post4x_internal
 
-DB	102,72,15,126,209
-DB	102,72,15,126,226
+	movq	rcx,xmm2
+	movq	rdx,xmm4
 	mov	rdi,rsi
 	mov	rax,QWORD[40+rsp]
 	lea	r8,[32+rsp]
@@ -1786,7 +1786,7 @@
 	adc	r8,rdx
 	mov	QWORD[((-16))+rdi],rbx
 	mov	QWORD[((-8))+rdi],r8
-DB	102,72,15,126,213
+	movq	rbp,xmm2
 __bn_sqr8x_reduction:
 	xor	rax,rax
 	lea	rcx,[rbp*1+r9]
@@ -2030,11 +2030,11 @@
 	mov	rcx,QWORD[((-8))+rbp]
 	xor	rsi,rsi
 
-DB	102,72,15,126,213
+	movq	rbp,xmm2
 
 	mov	QWORD[rdi],r8
 	mov	QWORD[8+rdi],r9
-DB	102,73,15,126,217
+	movq	r9,xmm3
 	mov	QWORD[16+rdi],r10
 	mov	QWORD[24+rdi],r11
 	mov	QWORD[32+rdi],r12
@@ -2055,9 +2055,9 @@
 	mov	r12,QWORD[rbp]
 	lea	rbx,[r9*1+rdi]
 	mov	rcx,r9
-DB	102,72,15,126,207
+	movq	rdi,xmm1
 	neg	rax
-DB	102,72,15,126,206
+	movq	rsi,xmm1
 	sar	rcx,3+2
 	dec	r12
 	xor	r10,r10
@@ -2380,7 +2380,7 @@
 	pshufd	xmm1,xmm0,0x4e
 	por	xmm0,xmm1
 	lea	rdi,[256+rdi]
-DB	102,72,15,126,194
+	movq	rdx,xmm0
 	lea	rbx,[((64+32+8))+rsp]
 
 	mov	r9,rdx
@@ -2531,7 +2531,7 @@
 	pshufd	xmm0,xmm4,0x4e
 	por	xmm0,xmm4
 	lea	rdi,[256+rdi]
-DB	102,72,15,126,194
+	movq	rdx,xmm0
 
 	mov	QWORD[rbx],rbp
 	lea	rbx,[32+rax*1+rbx]
@@ -2759,10 +2759,10 @@
 
 
 	pxor	xmm0,xmm0
-DB	102,72,15,110,207
-DB	102,72,15,110,209
-DB	102,73,15,110,218
-DB	102,72,15,110,226
+	movq	xmm1,rdi
+	movq	xmm2,rcx
+	movq	xmm3,r10
+	movq	xmm4,rdx
 	mov	QWORD[32+rsp],r8
 	mov	QWORD[40+rsp],rax
 
@@ -2781,8 +2781,8 @@
 
 	mov	r9,r10
 	mov	rdi,rsi
-DB	102,72,15,126,209
-DB	102,72,15,126,226
+	movq	rcx,xmm2
+	movq	rdx,xmm4
 	mov	rax,QWORD[40+rsp]
 
 	call	mulx4x_internal
@@ -3148,7 +3148,7 @@
 ALIGN	32
 $L$sqrx8x_outer_break:
 	mov	QWORD[72+rdi],r9
-DB	102,72,15,126,217
+	movq	rcx,xmm3
 	mov	QWORD[80+rdi],r10
 	mov	QWORD[88+rdi],r11
 	mov	QWORD[96+rdi],r12
@@ -3222,7 +3222,7 @@
 	mov	QWORD[48+rdi],rax
 	mov	QWORD[56+rdi],rbx
 	lea	rdi,[64+rdi]
-DB	102,72,15,126,213
+	movq	rbp,xmm2
 __bn_sqrx8x_reduction:
 	xor	eax,eax
 	mov	rbx,QWORD[((32+8))+rsp]
@@ -3402,10 +3402,10 @@
 	sub	rsi,QWORD[((16+8))+rsp]
 $L$sqrx8x_no_tail:
 	adc	r8,QWORD[rdi]
-DB	102,72,15,126,217
+	movq	rcx,xmm3
 	adc	r9,QWORD[8+rdi]
 	mov	rsi,QWORD[56+rbp]
-DB	102,72,15,126,213
+	movq	rbp,xmm2
 	adc	r10,QWORD[16+rdi]
 	adc	r11,QWORD[24+rdi]
 	adc	r12,QWORD[32+rdi]
@@ -3443,8 +3443,8 @@
 	neg	rax
 	sar	rcx,3+2
 
-DB	102,72,15,126,202
-DB	102,72,15,126,206
+	movq	rdx,xmm1
+	movq	rsi,xmm1
 	dec	r12
 	mov	r13,QWORD[8+rbp]
 	xor	r8,r8
diff --git a/gen/crypto/chacha-x86_64-apple.S b/gen/crypto/chacha-x86_64-apple.S
index d330661..09481e3 100644
--- a/gen/crypto/chacha-x86_64-apple.S
+++ b/gen/crypto/chacha-x86_64-apple.S
@@ -91,7 +91,7 @@
 	movq	%rbp,64+0(%rsp)
 	movl	$10,%ebp
 	movq	%rsi,64+8(%rsp)
-.byte	102,72,15,126,214
+	movq	%xmm2,%rsi
 	movq	%rdi,64+16(%rsp)
 	movq	%rsi,%rdi
 	shrq	$32,%rdi
@@ -354,7 +354,7 @@
 L$oop_ssse3:
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -363,7 +363,7 @@
 	por	%xmm4,%xmm1
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -376,7 +376,7 @@
 	nop
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -385,7 +385,7 @@
 	por	%xmm4,%xmm1
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -540,8 +540,8 @@
 	paddd	%xmm13,%xmm9
 	pxor	%xmm8,%xmm0
 	pxor	%xmm9,%xmm1
-.byte	102,15,56,0,199
-.byte	102,15,56,0,207
+	pshufb	%xmm7,%xmm0
+	pshufb	%xmm7,%xmm1
 	paddd	%xmm0,%xmm4
 	paddd	%xmm1,%xmm5
 	pxor	%xmm4,%xmm12
@@ -559,8 +559,8 @@
 	paddd	%xmm13,%xmm9
 	pxor	%xmm8,%xmm0
 	pxor	%xmm9,%xmm1
-.byte	102,15,56,0,198
-.byte	102,15,56,0,206
+	pshufb	%xmm6,%xmm0
+	pshufb	%xmm6,%xmm1
 	paddd	%xmm0,%xmm4
 	paddd	%xmm1,%xmm5
 	pxor	%xmm4,%xmm12
@@ -582,8 +582,8 @@
 	paddd	%xmm15,%xmm11
 	pxor	%xmm10,%xmm2
 	pxor	%xmm11,%xmm3
-.byte	102,15,56,0,215
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm2
+	pshufb	%xmm7,%xmm3
 	paddd	%xmm2,%xmm4
 	paddd	%xmm3,%xmm5
 	pxor	%xmm4,%xmm14
@@ -601,8 +601,8 @@
 	paddd	%xmm15,%xmm11
 	pxor	%xmm10,%xmm2
 	pxor	%xmm11,%xmm3
-.byte	102,15,56,0,214
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm2
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm2,%xmm4
 	paddd	%xmm3,%xmm5
 	pxor	%xmm4,%xmm14
@@ -620,8 +620,8 @@
 	paddd	%xmm14,%xmm9
 	pxor	%xmm8,%xmm3
 	pxor	%xmm9,%xmm0
-.byte	102,15,56,0,223
-.byte	102,15,56,0,199
+	pshufb	%xmm7,%xmm3
+	pshufb	%xmm7,%xmm0
 	paddd	%xmm3,%xmm4
 	paddd	%xmm0,%xmm5
 	pxor	%xmm4,%xmm13
@@ -639,8 +639,8 @@
 	paddd	%xmm14,%xmm9
 	pxor	%xmm8,%xmm3
 	pxor	%xmm9,%xmm0
-.byte	102,15,56,0,222
-.byte	102,15,56,0,198
+	pshufb	%xmm6,%xmm3
+	pshufb	%xmm6,%xmm0
 	paddd	%xmm3,%xmm4
 	paddd	%xmm0,%xmm5
 	pxor	%xmm4,%xmm13
@@ -662,8 +662,8 @@
 	paddd	%xmm12,%xmm11
 	pxor	%xmm10,%xmm1
 	pxor	%xmm11,%xmm2
-.byte	102,15,56,0,207
-.byte	102,15,56,0,215
+	pshufb	%xmm7,%xmm1
+	pshufb	%xmm7,%xmm2
 	paddd	%xmm1,%xmm4
 	paddd	%xmm2,%xmm5
 	pxor	%xmm4,%xmm15
@@ -681,8 +681,8 @@
 	paddd	%xmm12,%xmm11
 	pxor	%xmm10,%xmm1
 	pxor	%xmm11,%xmm2
-.byte	102,15,56,0,206
-.byte	102,15,56,0,214
+	pshufb	%xmm6,%xmm1
+	pshufb	%xmm6,%xmm2
 	paddd	%xmm1,%xmm4
 	paddd	%xmm2,%xmm5
 	pxor	%xmm4,%xmm15
diff --git a/gen/crypto/chacha-x86_64-linux.S b/gen/crypto/chacha-x86_64-linux.S
index d76e6d0..8ea190d 100644
--- a/gen/crypto/chacha-x86_64-linux.S
+++ b/gen/crypto/chacha-x86_64-linux.S
@@ -97,7 +97,7 @@
 	movq	%rbp,64+0(%rsp)
 	movl	$10,%ebp
 	movq	%rsi,64+8(%rsp)
-.byte	102,72,15,126,214
+	movq	%xmm2,%rsi
 	movq	%rdi,64+16(%rsp)
 	movq	%rsi,%rdi
 	shrq	$32,%rdi
@@ -360,7 +360,7 @@
 .Loop_ssse3:
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -369,7 +369,7 @@
 	por	%xmm4,%xmm1
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -382,7 +382,7 @@
 	nop
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -391,7 +391,7 @@
 	por	%xmm4,%xmm1
 	paddd	%xmm1,%xmm0
 	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm3
 	paddd	%xmm3,%xmm2
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm1,%xmm4
@@ -546,8 +546,8 @@
 	paddd	%xmm13,%xmm9
 	pxor	%xmm8,%xmm0
 	pxor	%xmm9,%xmm1
-.byte	102,15,56,0,199
-.byte	102,15,56,0,207
+	pshufb	%xmm7,%xmm0
+	pshufb	%xmm7,%xmm1
 	paddd	%xmm0,%xmm4
 	paddd	%xmm1,%xmm5
 	pxor	%xmm4,%xmm12
@@ -565,8 +565,8 @@
 	paddd	%xmm13,%xmm9
 	pxor	%xmm8,%xmm0
 	pxor	%xmm9,%xmm1
-.byte	102,15,56,0,198
-.byte	102,15,56,0,206
+	pshufb	%xmm6,%xmm0
+	pshufb	%xmm6,%xmm1
 	paddd	%xmm0,%xmm4
 	paddd	%xmm1,%xmm5
 	pxor	%xmm4,%xmm12
@@ -588,8 +588,8 @@
 	paddd	%xmm15,%xmm11
 	pxor	%xmm10,%xmm2
 	pxor	%xmm11,%xmm3
-.byte	102,15,56,0,215
-.byte	102,15,56,0,223
+	pshufb	%xmm7,%xmm2
+	pshufb	%xmm7,%xmm3
 	paddd	%xmm2,%xmm4
 	paddd	%xmm3,%xmm5
 	pxor	%xmm4,%xmm14
@@ -607,8 +607,8 @@
 	paddd	%xmm15,%xmm11
 	pxor	%xmm10,%xmm2
 	pxor	%xmm11,%xmm3
-.byte	102,15,56,0,214
-.byte	102,15,56,0,222
+	pshufb	%xmm6,%xmm2
+	pshufb	%xmm6,%xmm3
 	paddd	%xmm2,%xmm4
 	paddd	%xmm3,%xmm5
 	pxor	%xmm4,%xmm14
@@ -626,8 +626,8 @@
 	paddd	%xmm14,%xmm9
 	pxor	%xmm8,%xmm3
 	pxor	%xmm9,%xmm0
-.byte	102,15,56,0,223
-.byte	102,15,56,0,199
+	pshufb	%xmm7,%xmm3
+	pshufb	%xmm7,%xmm0
 	paddd	%xmm3,%xmm4
 	paddd	%xmm0,%xmm5
 	pxor	%xmm4,%xmm13
@@ -645,8 +645,8 @@
 	paddd	%xmm14,%xmm9
 	pxor	%xmm8,%xmm3
 	pxor	%xmm9,%xmm0
-.byte	102,15,56,0,222
-.byte	102,15,56,0,198
+	pshufb	%xmm6,%xmm3
+	pshufb	%xmm6,%xmm0
 	paddd	%xmm3,%xmm4
 	paddd	%xmm0,%xmm5
 	pxor	%xmm4,%xmm13
@@ -668,8 +668,8 @@
 	paddd	%xmm12,%xmm11
 	pxor	%xmm10,%xmm1
 	pxor	%xmm11,%xmm2
-.byte	102,15,56,0,207
-.byte	102,15,56,0,215
+	pshufb	%xmm7,%xmm1
+	pshufb	%xmm7,%xmm2
 	paddd	%xmm1,%xmm4
 	paddd	%xmm2,%xmm5
 	pxor	%xmm4,%xmm15
@@ -687,8 +687,8 @@
 	paddd	%xmm12,%xmm11
 	pxor	%xmm10,%xmm1
 	pxor	%xmm11,%xmm2
-.byte	102,15,56,0,206
-.byte	102,15,56,0,214
+	pshufb	%xmm6,%xmm1
+	pshufb	%xmm6,%xmm2
 	paddd	%xmm1,%xmm4
 	paddd	%xmm2,%xmm5
 	pxor	%xmm4,%xmm15
diff --git a/gen/crypto/chacha-x86_64-win.asm b/gen/crypto/chacha-x86_64-win.asm
index f1f9a0d..f9cae3f 100644
--- a/gen/crypto/chacha-x86_64-win.asm
+++ b/gen/crypto/chacha-x86_64-win.asm
@@ -114,7 +114,7 @@
 	mov	QWORD[((64+0))+rsp],rbp
 	mov	ebp,10
 	mov	QWORD[((64+8))+rsp],rsi
-DB	102,72,15,126,214
+	movq	rsi,xmm2
 	mov	QWORD[((64+16))+rsp],rdi
 	mov	rdi,rsi
 	shr	rdi,32
@@ -392,7 +392,7 @@
 $L$oop_ssse3:
 	paddd	xmm0,xmm1
 	pxor	xmm3,xmm0
-DB	102,15,56,0,222
+	pshufb	xmm3,xmm6
 	paddd	xmm2,xmm3
 	pxor	xmm1,xmm2
 	movdqa	xmm4,xmm1
@@ -401,7 +401,7 @@
 	por	xmm1,xmm4
 	paddd	xmm0,xmm1
 	pxor	xmm3,xmm0
-DB	102,15,56,0,223
+	pshufb	xmm3,xmm7
 	paddd	xmm2,xmm3
 	pxor	xmm1,xmm2
 	movdqa	xmm4,xmm1
@@ -414,7 +414,7 @@
 	nop
 	paddd	xmm0,xmm1
 	pxor	xmm3,xmm0
-DB	102,15,56,0,222
+	pshufb	xmm3,xmm6
 	paddd	xmm2,xmm3
 	pxor	xmm1,xmm2
 	movdqa	xmm4,xmm1
@@ -423,7 +423,7 @@
 	por	xmm1,xmm4
 	paddd	xmm0,xmm1
 	pxor	xmm3,xmm0
-DB	102,15,56,0,223
+	pshufb	xmm3,xmm7
 	paddd	xmm2,xmm3
 	pxor	xmm1,xmm2
 	movdqa	xmm4,xmm1
@@ -603,8 +603,8 @@
 	paddd	xmm9,xmm13
 	pxor	xmm0,xmm8
 	pxor	xmm1,xmm9
-DB	102,15,56,0,199
-DB	102,15,56,0,207
+	pshufb	xmm0,xmm7
+	pshufb	xmm1,xmm7
 	paddd	xmm4,xmm0
 	paddd	xmm5,xmm1
 	pxor	xmm12,xmm4
@@ -622,8 +622,8 @@
 	paddd	xmm9,xmm13
 	pxor	xmm0,xmm8
 	pxor	xmm1,xmm9
-DB	102,15,56,0,198
-DB	102,15,56,0,206
+	pshufb	xmm0,xmm6
+	pshufb	xmm1,xmm6
 	paddd	xmm4,xmm0
 	paddd	xmm5,xmm1
 	pxor	xmm12,xmm4
@@ -645,8 +645,8 @@
 	paddd	xmm11,xmm15
 	pxor	xmm2,xmm10
 	pxor	xmm3,xmm11
-DB	102,15,56,0,215
-DB	102,15,56,0,223
+	pshufb	xmm2,xmm7
+	pshufb	xmm3,xmm7
 	paddd	xmm4,xmm2
 	paddd	xmm5,xmm3
 	pxor	xmm14,xmm4
@@ -664,8 +664,8 @@
 	paddd	xmm11,xmm15
 	pxor	xmm2,xmm10
 	pxor	xmm3,xmm11
-DB	102,15,56,0,214
-DB	102,15,56,0,222
+	pshufb	xmm2,xmm6
+	pshufb	xmm3,xmm6
 	paddd	xmm4,xmm2
 	paddd	xmm5,xmm3
 	pxor	xmm14,xmm4
@@ -683,8 +683,8 @@
 	paddd	xmm9,xmm14
 	pxor	xmm3,xmm8
 	pxor	xmm0,xmm9
-DB	102,15,56,0,223
-DB	102,15,56,0,199
+	pshufb	xmm3,xmm7
+	pshufb	xmm0,xmm7
 	paddd	xmm4,xmm3
 	paddd	xmm5,xmm0
 	pxor	xmm13,xmm4
@@ -702,8 +702,8 @@
 	paddd	xmm9,xmm14
 	pxor	xmm3,xmm8
 	pxor	xmm0,xmm9
-DB	102,15,56,0,222
-DB	102,15,56,0,198
+	pshufb	xmm3,xmm6
+	pshufb	xmm0,xmm6
 	paddd	xmm4,xmm3
 	paddd	xmm5,xmm0
 	pxor	xmm13,xmm4
@@ -725,8 +725,8 @@
 	paddd	xmm11,xmm12
 	pxor	xmm1,xmm10
 	pxor	xmm2,xmm11
-DB	102,15,56,0,207
-DB	102,15,56,0,215
+	pshufb	xmm1,xmm7
+	pshufb	xmm2,xmm7
 	paddd	xmm4,xmm1
 	paddd	xmm5,xmm2
 	pxor	xmm15,xmm4
@@ -744,8 +744,8 @@
 	paddd	xmm11,xmm12
 	pxor	xmm1,xmm10
 	pxor	xmm2,xmm11
-DB	102,15,56,0,206
-DB	102,15,56,0,214
+	pshufb	xmm1,xmm6
+	pshufb	xmm2,xmm6
 	paddd	xmm4,xmm1
 	paddd	xmm5,xmm2
 	pxor	xmm15,xmm4
diff --git a/gen/crypto/chacha20_poly1305_x86_64-apple.S b/gen/crypto/chacha20_poly1305_x86_64-apple.S
index a261463..4044212 100644
--- a/gen/crypto/chacha20_poly1305_x86_64-apple.S
+++ b/gen/crypto/chacha20_poly1305_x86_64-apple.S
@@ -278,9 +278,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	L$rol16(%rip),%xmm12
@@ -299,9 +299,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 
 	decq	%r10
 	jne	L$open_sse_init_rounds
@@ -359,10 +359,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -413,10 +413,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -457,18 +457,18 @@
 	imulq	%r12,%r9
 	addq	%r10,%r15
 	adcq	%rdx,%r9
-.byte	102,15,58,15,255,4
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,12
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$12,%xmm15,%xmm15
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	movdqa	%xmm8,0+80(%rbp)
 	movdqa	L$rol16(%rip),%xmm8
 	paddd	%xmm7,%xmm3
@@ -493,10 +493,10 @@
 	adcq	$0,%r12
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -532,10 +532,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -563,18 +563,18 @@
 	pslld	$32-25,%xmm4
 	pxor	%xmm8,%xmm4
 	movdqa	0+80(%rbp),%xmm8
-.byte	102,15,58,15,255,12
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,4
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$4,%xmm15,%xmm15
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 
 	decq	%rcx
 	jge	L$open_sse_main_loop_rounds
@@ -776,9 +776,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	L$rol16(%rip),%xmm12
@@ -797,9 +797,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 
 	cmpq	$16,%rcx
 	jae	L$open_sse_tail_64_rounds_and_x1hash
@@ -891,9 +891,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -912,9 +912,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	L$rol16(%rip),%xmm12
@@ -933,9 +933,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -954,9 +954,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 
 	cmpq	%rcx,%r8
 	jb	L$open_sse_tail_128_rounds_and_x1hash
@@ -1076,9 +1076,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -1097,9 +1097,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	L$rol16(%rip),%xmm14
@@ -1118,9 +1118,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	L$rol16(%rip),%xmm12
@@ -1139,9 +1139,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -1160,9 +1160,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	L$rol16(%rip),%xmm14
@@ -1181,9 +1181,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
 
 	cmpq	%rcx,%r8
 	jb	L$open_sse_tail_192_rounds_and_x1hash
@@ -1368,9 +1368,9 @@
 	pslld	$7,%xmm11
 	psrld	$25,%xmm4
 	pxor	%xmm11,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -1389,9 +1389,9 @@
 	pslld	$7,%xmm11
 	psrld	$25,%xmm5
 	pxor	%xmm11,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	L$rol16(%rip),%xmm14
@@ -1410,9 +1410,9 @@
 	pslld	$7,%xmm11
 	psrld	$25,%xmm6
 	pxor	%xmm11,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
 	movdqa	0+80(%rbp),%xmm11
 	movq	0+0+0(%rbp),%rax
 	movq	%rax,%r15
@@ -1443,9 +1443,9 @@
 	pslld	$7,%xmm9
 	psrld	$25,%xmm7
 	pxor	%xmm9,%xmm7
-.byte	102,15,58,15,255,4
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,12
+	palignr	$4,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$12,%xmm15,%xmm15
 	movdqa	0+80(%rbp),%xmm9
 	movq	8+0+0(%rbp),%rax
 	movq	%rax,%r9
@@ -1476,9 +1476,9 @@
 	pslld	$7,%xmm11
 	psrld	$25,%xmm4
 	pxor	%xmm11,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -1497,9 +1497,9 @@
 	pslld	$7,%xmm11
 	psrld	$25,%xmm5
 	pxor	%xmm11,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 	imulq	%r12,%r9
 	addq	%r10,%r15
 	adcq	%rdx,%r9
@@ -1521,9 +1521,9 @@
 	pslld	$7,%xmm11
 	psrld	$25,%xmm6
 	pxor	%xmm11,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
 	movdqa	0+80(%rbp),%xmm11
 	movq	%r13,%r10
 	movq	%r14,%r11
@@ -1558,9 +1558,9 @@
 	pslld	$7,%xmm9
 	psrld	$25,%xmm7
 	pxor	%xmm9,%xmm7
-.byte	102,15,58,15,255,12
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,4
+	palignr	$12,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$4,%xmm15,%xmm15
 	movdqa	0+80(%rbp),%xmm9
 
 	addq	$16,%r8
@@ -1707,7 +1707,7 @@
 	subq	$1,%r8
 	jnz	L$open_sse_tail_16_compose
 
-.byte	102,73,15,126,221
+	movq	%xmm3,%r13
 	pextrq	$1,%xmm3,%r14
 
 	pxor	%xmm1,%xmm3
@@ -1880,9 +1880,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -1901,9 +1901,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	L$rol16(%rip),%xmm14
@@ -1922,9 +1922,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	L$rol16(%rip),%xmm12
@@ -1943,9 +1943,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -1964,9 +1964,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	L$rol16(%rip),%xmm14
@@ -1985,9 +1985,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
 
 	decq	%r10
 	jnz	L$open_sse_128_rounds
@@ -2155,10 +2155,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2194,10 +2194,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2225,18 +2225,18 @@
 	pslld	$32-25,%xmm4
 	pxor	%xmm8,%xmm4
 	movdqa	0+80(%rbp),%xmm8
-.byte	102,15,58,15,255,4
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,12
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$12,%xmm15,%xmm15
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	movdqa	%xmm8,0+80(%rbp)
 	movdqa	L$rol16(%rip),%xmm8
 	paddd	%xmm7,%xmm3
@@ -2247,10 +2247,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2286,10 +2286,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2317,18 +2317,18 @@
 	pslld	$32-25,%xmm4
 	pxor	%xmm8,%xmm4
 	movdqa	0+80(%rbp),%xmm8
-.byte	102,15,58,15,255,12
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,4
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$4,%xmm15,%xmm15
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 
 	decq	%r10
 	jnz	L$seal_sse_init_rounds
@@ -2451,10 +2451,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2503,10 +2503,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2547,18 +2547,18 @@
 	imulq	%r12,%r9
 	addq	%r10,%r15
 	adcq	%rdx,%r9
-.byte	102,15,58,15,255,4
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,12
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$12,%xmm15,%xmm15
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	movdqa	%xmm8,0+80(%rbp)
 	movdqa	L$rol16(%rip),%xmm8
 	paddd	%xmm7,%xmm3
@@ -2583,10 +2583,10 @@
 	adcq	$0,%r12
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2622,10 +2622,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2653,18 +2653,18 @@
 	pslld	$32-25,%xmm4
 	pxor	%xmm8,%xmm4
 	movdqa	0+80(%rbp),%xmm8
-.byte	102,15,58,15,255,12
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,4
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$4,%xmm15,%xmm15
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 
 	leaq	16(%rdi),%rdi
 	decq	%r8
@@ -2877,9 +2877,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	L$rol16(%rip),%xmm12
@@ -2898,9 +2898,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	addq	0+0(%rdi),%r10
 	adcq	8+0(%rdi),%r11
 	adcq	$1,%r12
@@ -3030,9 +3030,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -3051,9 +3051,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	addq	0+0(%rdi),%r10
 	adcq	8+0(%rdi),%r11
 	adcq	$1,%r12
@@ -3112,9 +3112,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -3133,9 +3133,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 
 	leaq	16(%rdi),%rdi
 	decq	%rcx
@@ -3250,9 +3250,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -3271,9 +3271,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	L$rol16(%rip),%xmm14
@@ -3292,9 +3292,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
 	addq	0+0(%rdi),%r10
 	adcq	8+0(%rdi),%r11
 	adcq	$1,%r12
@@ -3353,9 +3353,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -3374,9 +3374,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	L$rol16(%rip),%xmm14
@@ -3395,9 +3395,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
 
 	leaq	16(%rdi),%rdi
 	decq	%rcx
@@ -3649,7 +3649,7 @@
 
 
 
-.byte	102,77,15,126,253
+	movq	%xmm15,%r13
 	pextrq	$1,%xmm15,%r14
 	addq	%r13,%r10
 	adcq	%r14,%r11
@@ -3765,7 +3765,7 @@
 	leaq	L$and_masks(%rip),%r15
 	shlq	$4,%rbx
 	pand	-16(%r15,%rbx,1),%xmm15
-.byte	102,77,15,126,253
+	movq	%xmm15,%r13
 	pextrq	$1,%xmm15,%r14
 	addq	%r13,%r10
 	adcq	%r14,%r11
@@ -3927,9 +3927,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -3948,9 +3948,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	L$rol16(%rip),%xmm14
@@ -3969,9 +3969,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	L$rol16(%rip),%xmm12
@@ -3990,9 +3990,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	L$rol16(%rip),%xmm13
@@ -4011,9 +4011,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	L$rol16(%rip),%xmm14
@@ -4032,9 +4032,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
 
 	decq	%r10
 	jnz	L$seal_sse_128_rounds
diff --git a/gen/crypto/chacha20_poly1305_x86_64-linux.S b/gen/crypto/chacha20_poly1305_x86_64-linux.S
index 180b41e..6fd94c8 100644
--- a/gen/crypto/chacha20_poly1305_x86_64-linux.S
+++ b/gen/crypto/chacha20_poly1305_x86_64-linux.S
@@ -285,9 +285,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	.Lrol16(%rip),%xmm12
@@ -306,9 +306,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 
 	decq	%r10
 	jne	.Lopen_sse_init_rounds
@@ -366,10 +366,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -420,10 +420,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -464,18 +464,18 @@
 	imulq	%r12,%r9
 	addq	%r10,%r15
 	adcq	%rdx,%r9
-.byte	102,15,58,15,255,4
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,12
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$12,%xmm15,%xmm15
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	movdqa	%xmm8,0+80(%rbp)
 	movdqa	.Lrol16(%rip),%xmm8
 	paddd	%xmm7,%xmm3
@@ -500,10 +500,10 @@
 	adcq	$0,%r12
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -539,10 +539,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -570,18 +570,18 @@
 	pslld	$32-25,%xmm4
 	pxor	%xmm8,%xmm4
 	movdqa	0+80(%rbp),%xmm8
-.byte	102,15,58,15,255,12
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,4
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$4,%xmm15,%xmm15
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 
 	decq	%rcx
 	jge	.Lopen_sse_main_loop_rounds
@@ -783,9 +783,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	.Lrol16(%rip),%xmm12
@@ -804,9 +804,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 
 	cmpq	$16,%rcx
 	jae	.Lopen_sse_tail_64_rounds_and_x1hash
@@ -898,9 +898,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -919,9 +919,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	.Lrol16(%rip),%xmm12
@@ -940,9 +940,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -961,9 +961,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 
 	cmpq	%rcx,%r8
 	jb	.Lopen_sse_tail_128_rounds_and_x1hash
@@ -1083,9 +1083,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -1104,9 +1104,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	.Lrol16(%rip),%xmm14
@@ -1125,9 +1125,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	.Lrol16(%rip),%xmm12
@@ -1146,9 +1146,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -1167,9 +1167,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	.Lrol16(%rip),%xmm14
@@ -1188,9 +1188,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
 
 	cmpq	%rcx,%r8
 	jb	.Lopen_sse_tail_192_rounds_and_x1hash
@@ -1375,9 +1375,9 @@
 	pslld	$7,%xmm11
 	psrld	$25,%xmm4
 	pxor	%xmm11,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -1396,9 +1396,9 @@
 	pslld	$7,%xmm11
 	psrld	$25,%xmm5
 	pxor	%xmm11,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	.Lrol16(%rip),%xmm14
@@ -1417,9 +1417,9 @@
 	pslld	$7,%xmm11
 	psrld	$25,%xmm6
 	pxor	%xmm11,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
 	movdqa	0+80(%rbp),%xmm11
 	movq	0+0+0(%rbp),%rax
 	movq	%rax,%r15
@@ -1450,9 +1450,9 @@
 	pslld	$7,%xmm9
 	psrld	$25,%xmm7
 	pxor	%xmm9,%xmm7
-.byte	102,15,58,15,255,4
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,12
+	palignr	$4,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$12,%xmm15,%xmm15
 	movdqa	0+80(%rbp),%xmm9
 	movq	8+0+0(%rbp),%rax
 	movq	%rax,%r9
@@ -1483,9 +1483,9 @@
 	pslld	$7,%xmm11
 	psrld	$25,%xmm4
 	pxor	%xmm11,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -1504,9 +1504,9 @@
 	pslld	$7,%xmm11
 	psrld	$25,%xmm5
 	pxor	%xmm11,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 	imulq	%r12,%r9
 	addq	%r10,%r15
 	adcq	%rdx,%r9
@@ -1528,9 +1528,9 @@
 	pslld	$7,%xmm11
 	psrld	$25,%xmm6
 	pxor	%xmm11,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
 	movdqa	0+80(%rbp),%xmm11
 	movq	%r13,%r10
 	movq	%r14,%r11
@@ -1565,9 +1565,9 @@
 	pslld	$7,%xmm9
 	psrld	$25,%xmm7
 	pxor	%xmm9,%xmm7
-.byte	102,15,58,15,255,12
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,4
+	palignr	$12,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$4,%xmm15,%xmm15
 	movdqa	0+80(%rbp),%xmm9
 
 	addq	$16,%r8
@@ -1714,7 +1714,7 @@
 	subq	$1,%r8
 	jnz	.Lopen_sse_tail_16_compose
 
-.byte	102,73,15,126,221
+	movq	%xmm3,%r13
 	pextrq	$1,%xmm3,%r14
 
 	pxor	%xmm1,%xmm3
@@ -1894,9 +1894,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -1915,9 +1915,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	.Lrol16(%rip),%xmm14
@@ -1936,9 +1936,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	.Lrol16(%rip),%xmm12
@@ -1957,9 +1957,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -1978,9 +1978,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	.Lrol16(%rip),%xmm14
@@ -1999,9 +1999,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
 
 	decq	%r10
 	jnz	.Lopen_sse_128_rounds
@@ -2176,10 +2176,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2215,10 +2215,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2246,18 +2246,18 @@
 	pslld	$32-25,%xmm4
 	pxor	%xmm8,%xmm4
 	movdqa	0+80(%rbp),%xmm8
-.byte	102,15,58,15,255,4
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,12
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$12,%xmm15,%xmm15
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	movdqa	%xmm8,0+80(%rbp)
 	movdqa	.Lrol16(%rip),%xmm8
 	paddd	%xmm7,%xmm3
@@ -2268,10 +2268,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2307,10 +2307,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2338,18 +2338,18 @@
 	pslld	$32-25,%xmm4
 	pxor	%xmm8,%xmm4
 	movdqa	0+80(%rbp),%xmm8
-.byte	102,15,58,15,255,12
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,4
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$4,%xmm15,%xmm15
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 
 	decq	%r10
 	jnz	.Lseal_sse_init_rounds
@@ -2472,10 +2472,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2524,10 +2524,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2568,18 +2568,18 @@
 	imulq	%r12,%r9
 	addq	%r10,%r15
 	adcq	%rdx,%r9
-.byte	102,15,58,15,255,4
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,12
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$12,%xmm15,%xmm15
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	movdqa	%xmm8,0+80(%rbp)
 	movdqa	.Lrol16(%rip),%xmm8
 	paddd	%xmm7,%xmm3
@@ -2604,10 +2604,10 @@
 	adcq	$0,%r12
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2643,10 +2643,10 @@
 	pxor	%xmm2,%xmm14
 	pxor	%xmm1,%xmm13
 	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
+	pshufb	%xmm8,%xmm15
+	pshufb	%xmm8,%xmm14
+	pshufb	%xmm8,%xmm13
+	pshufb	%xmm8,%xmm12
 	movdqa	0+80(%rbp),%xmm8
 	paddd	%xmm15,%xmm11
 	paddd	%xmm14,%xmm10
@@ -2674,18 +2674,18 @@
 	pslld	$32-25,%xmm4
 	pxor	%xmm8,%xmm4
 	movdqa	0+80(%rbp),%xmm8
-.byte	102,15,58,15,255,12
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,4
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm7,%xmm7
+	palignr	$8,%xmm11,%xmm11
+	palignr	$4,%xmm15,%xmm15
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 
 	leaq	16(%rdi),%rdi
 	decq	%r8
@@ -2898,9 +2898,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	.Lrol16(%rip),%xmm12
@@ -2919,9 +2919,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	addq	0+0(%rdi),%r10
 	adcq	8+0(%rdi),%r11
 	adcq	$1,%r12
@@ -3051,9 +3051,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -3072,9 +3072,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	addq	0+0(%rdi),%r10
 	adcq	8+0(%rdi),%r11
 	adcq	$1,%r12
@@ -3133,9 +3133,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -3154,9 +3154,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 
 	leaq	16(%rdi),%rdi
 	decq	%rcx
@@ -3271,9 +3271,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -3292,9 +3292,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	.Lrol16(%rip),%xmm14
@@ -3313,9 +3313,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
 	addq	0+0(%rdi),%r10
 	adcq	8+0(%rdi),%r11
 	adcq	$1,%r12
@@ -3374,9 +3374,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -3395,9 +3395,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	.Lrol16(%rip),%xmm14
@@ -3416,9 +3416,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
 
 	leaq	16(%rdi),%rdi
 	decq	%rcx
@@ -3670,7 +3670,7 @@
 
 
 
-.byte	102,77,15,126,253
+	movq	%xmm15,%r13
 	pextrq	$1,%xmm15,%r14
 	addq	%r13,%r10
 	adcq	%r14,%r11
@@ -3786,7 +3786,7 @@
 	leaq	.Land_masks(%rip),%r15
 	shlq	$4,%rbx
 	pand	-16(%r15,%rbx,1),%xmm15
-.byte	102,77,15,126,253
+	movq	%xmm15,%r13
 	pextrq	$1,%xmm15,%r14
 	addq	%r13,%r10
 	adcq	%r14,%r11
@@ -3955,9 +3955,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
+	palignr	$4,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$12,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -3976,9 +3976,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
+	palignr	$4,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$12,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	.Lrol16(%rip),%xmm14
@@ -3997,9 +3997,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
+	palignr	$4,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$12,%xmm14,%xmm14
 	paddd	%xmm4,%xmm0
 	pxor	%xmm0,%xmm12
 	pshufb	.Lrol16(%rip),%xmm12
@@ -4018,9 +4018,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm4
 	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
+	palignr	$12,%xmm4,%xmm4
+	palignr	$8,%xmm8,%xmm8
+	palignr	$4,%xmm12,%xmm12
 	paddd	%xmm5,%xmm1
 	pxor	%xmm1,%xmm13
 	pshufb	.Lrol16(%rip),%xmm13
@@ -4039,9 +4039,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm5
 	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
+	palignr	$12,%xmm5,%xmm5
+	palignr	$8,%xmm9,%xmm9
+	palignr	$4,%xmm13,%xmm13
 	paddd	%xmm6,%xmm2
 	pxor	%xmm2,%xmm14
 	pshufb	.Lrol16(%rip),%xmm14
@@ -4060,9 +4060,9 @@
 	pslld	$7,%xmm3
 	psrld	$25,%xmm6
 	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
+	palignr	$12,%xmm6,%xmm6
+	palignr	$8,%xmm10,%xmm10
+	palignr	$4,%xmm14,%xmm14
 
 	decq	%r10
 	jnz	.Lseal_sse_128_rounds
diff --git a/gen/crypto/chacha20_poly1305_x86_64-win.asm b/gen/crypto/chacha20_poly1305_x86_64-win.asm
index 25c69ef..7ff65db 100644
--- a/gen/crypto/chacha20_poly1305_x86_64-win.asm
+++ b/gen/crypto/chacha20_poly1305_x86_64-win.asm
@@ -308,9 +308,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	paddd	xmm0,xmm4
 	pxor	xmm12,xmm0
 	pshufb	xmm12,XMMWORD[$L$rol16]
@@ -329,9 +329,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 
 	dec	r10
 	jne	NEAR $L$open_sse_init_rounds
@@ -389,10 +389,10 @@
 	pxor	xmm14,xmm2
 	pxor	xmm13,xmm1
 	pxor	xmm12,xmm0
-DB	102,69,15,56,0,248
-DB	102,69,15,56,0,240
-DB	102,69,15,56,0,232
-DB	102,69,15,56,0,224
+	pshufb	xmm15,xmm8
+	pshufb	xmm14,xmm8
+	pshufb	xmm13,xmm8
+	pshufb	xmm12,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
 	paddd	xmm11,xmm15
 	paddd	xmm10,xmm14
@@ -443,10 +443,10 @@
 	pxor	xmm14,xmm2
 	pxor	xmm13,xmm1
 	pxor	xmm12,xmm0
-DB	102,69,15,56,0,248
-DB	102,69,15,56,0,240
-DB	102,69,15,56,0,232
-DB	102,69,15,56,0,224
+	pshufb	xmm15,xmm8
+	pshufb	xmm14,xmm8
+	pshufb	xmm13,xmm8
+	pshufb	xmm12,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
 	paddd	xmm11,xmm15
 	paddd	xmm10,xmm14
@@ -487,18 +487,18 @@
 	imul	r9,r12
 	add	r15,r10
 	adc	r9,rdx
-DB	102,15,58,15,255,4
-DB	102,69,15,58,15,219,8
-DB	102,69,15,58,15,255,12
-DB	102,15,58,15,246,4
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,12
-DB	102,15,58,15,237,4
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,12
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm7,xmm7,4
+	palignr	xmm11,xmm11,8
+	palignr	xmm15,xmm15,12
+	palignr	xmm6,xmm6,4
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,12
+	palignr	xmm5,xmm5,4
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	movdqa	XMMWORD[(160+80)+rbp],xmm8
 	movdqa	xmm8,XMMWORD[$L$rol16]
 	paddd	xmm3,xmm7
@@ -523,10 +523,10 @@
 	adc	r12,0
 	pxor	xmm13,xmm1
 	pxor	xmm12,xmm0
-DB	102,69,15,56,0,248
-DB	102,69,15,56,0,240
-DB	102,69,15,56,0,232
-DB	102,69,15,56,0,224
+	pshufb	xmm15,xmm8
+	pshufb	xmm14,xmm8
+	pshufb	xmm13,xmm8
+	pshufb	xmm12,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
 	paddd	xmm11,xmm15
 	paddd	xmm10,xmm14
@@ -562,10 +562,10 @@
 	pxor	xmm14,xmm2
 	pxor	xmm13,xmm1
 	pxor	xmm12,xmm0
-DB	102,69,15,56,0,248
-DB	102,69,15,56,0,240
-DB	102,69,15,56,0,232
-DB	102,69,15,56,0,224
+	pshufb	xmm15,xmm8
+	pshufb	xmm14,xmm8
+	pshufb	xmm13,xmm8
+	pshufb	xmm12,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
 	paddd	xmm11,xmm15
 	paddd	xmm10,xmm14
@@ -593,18 +593,18 @@
 	pslld	xmm4,32-25
 	pxor	xmm4,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
-DB	102,15,58,15,255,12
-DB	102,69,15,58,15,219,8
-DB	102,69,15,58,15,255,4
-DB	102,15,58,15,246,12
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,4
-DB	102,15,58,15,237,12
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,4
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm7,xmm7,12
+	palignr	xmm11,xmm11,8
+	palignr	xmm15,xmm15,4
+	palignr	xmm6,xmm6,12
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,4
+	palignr	xmm5,xmm5,12
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 
 	dec	rcx
 	jge	NEAR $L$open_sse_main_loop_rounds
@@ -806,9 +806,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	paddd	xmm0,xmm4
 	pxor	xmm12,xmm0
 	pshufb	xmm12,XMMWORD[$L$rol16]
@@ -827,9 +827,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 
 	cmp	rcx,16
 	jae	NEAR $L$open_sse_tail_64_rounds_and_x1hash
@@ -921,9 +921,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -942,9 +942,9 @@
 	pslld	xmm3,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm3
-DB	102,15,58,15,237,4
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,12
+	palignr	xmm5,xmm5,4
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,12
 	paddd	xmm0,xmm4
 	pxor	xmm12,xmm0
 	pshufb	xmm12,XMMWORD[$L$rol16]
@@ -963,9 +963,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -984,9 +984,9 @@
 	pslld	xmm3,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm3
-DB	102,15,58,15,237,12
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,4
+	palignr	xmm5,xmm5,12
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,4
 
 	cmp	r8,rcx
 	jb	NEAR $L$open_sse_tail_128_rounds_and_x1hash
@@ -1106,9 +1106,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -1127,9 +1127,9 @@
 	pslld	xmm3,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm3
-DB	102,15,58,15,237,4
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,12
+	palignr	xmm5,xmm5,4
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,12
 	paddd	xmm2,xmm6
 	pxor	xmm14,xmm2
 	pshufb	xmm14,XMMWORD[$L$rol16]
@@ -1148,9 +1148,9 @@
 	pslld	xmm3,7
 	psrld	xmm6,25
 	pxor	xmm6,xmm3
-DB	102,15,58,15,246,4
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,12
+	palignr	xmm6,xmm6,4
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,12
 	paddd	xmm0,xmm4
 	pxor	xmm12,xmm0
 	pshufb	xmm12,XMMWORD[$L$rol16]
@@ -1169,9 +1169,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -1190,9 +1190,9 @@
 	pslld	xmm3,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm3
-DB	102,15,58,15,237,12
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,4
+	palignr	xmm5,xmm5,12
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,4
 	paddd	xmm2,xmm6
 	pxor	xmm14,xmm2
 	pshufb	xmm14,XMMWORD[$L$rol16]
@@ -1211,9 +1211,9 @@
 	pslld	xmm3,7
 	psrld	xmm6,25
 	pxor	xmm6,xmm3
-DB	102,15,58,15,246,12
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,4
+	palignr	xmm6,xmm6,12
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,4
 
 	cmp	r8,rcx
 	jb	NEAR $L$open_sse_tail_192_rounds_and_x1hash
@@ -1398,9 +1398,9 @@
 	pslld	xmm11,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm11
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -1419,9 +1419,9 @@
 	pslld	xmm11,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm11
-DB	102,15,58,15,237,4
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,12
+	palignr	xmm5,xmm5,4
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,12
 	paddd	xmm2,xmm6
 	pxor	xmm14,xmm2
 	pshufb	xmm14,XMMWORD[$L$rol16]
@@ -1440,9 +1440,9 @@
 	pslld	xmm11,7
 	psrld	xmm6,25
 	pxor	xmm6,xmm11
-DB	102,15,58,15,246,4
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,12
+	palignr	xmm6,xmm6,4
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,12
 	movdqa	xmm11,XMMWORD[((160+80))+rbp]
 	mov	rax,QWORD[((0+160+0))+rbp]
 	mov	r15,rax
@@ -1473,9 +1473,9 @@
 	pslld	xmm9,7
 	psrld	xmm7,25
 	pxor	xmm7,xmm9
-DB	102,15,58,15,255,4
-DB	102,69,15,58,15,219,8
-DB	102,69,15,58,15,255,12
+	palignr	xmm7,xmm7,4
+	palignr	xmm11,xmm11,8
+	palignr	xmm15,xmm15,12
 	movdqa	xmm9,XMMWORD[((160+80))+rbp]
 	mov	rax,QWORD[((8+160+0))+rbp]
 	mov	r9,rax
@@ -1506,9 +1506,9 @@
 	pslld	xmm11,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm11
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -1527,9 +1527,9 @@
 	pslld	xmm11,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm11
-DB	102,15,58,15,237,12
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,4
+	palignr	xmm5,xmm5,12
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,4
 	imul	r9,r12
 	add	r15,r10
 	adc	r9,rdx
@@ -1551,9 +1551,9 @@
 	pslld	xmm11,7
 	psrld	xmm6,25
 	pxor	xmm6,xmm11
-DB	102,15,58,15,246,12
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,4
+	palignr	xmm6,xmm6,12
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,4
 	movdqa	xmm11,XMMWORD[((160+80))+rbp]
 	mov	r10,r13
 	mov	r11,r14
@@ -1588,9 +1588,9 @@
 	pslld	xmm9,7
 	psrld	xmm7,25
 	pxor	xmm7,xmm9
-DB	102,15,58,15,255,12
-DB	102,69,15,58,15,219,8
-DB	102,69,15,58,15,255,4
+	palignr	xmm7,xmm7,12
+	palignr	xmm11,xmm11,8
+	palignr	xmm15,xmm15,4
 	movdqa	xmm9,XMMWORD[((160+80))+rbp]
 
 	add	r8,16
@@ -1737,7 +1737,7 @@
 	sub	r8,1
 	jnz	NEAR $L$open_sse_tail_16_compose
 
-DB	102,73,15,126,221
+	movq	r13,xmm3
 	pextrq	r14,xmm3,1
 
 	pxor	xmm3,xmm1
@@ -1923,9 +1923,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -1944,9 +1944,9 @@
 	pslld	xmm3,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm3
-DB	102,15,58,15,237,4
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,12
+	palignr	xmm5,xmm5,4
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,12
 	paddd	xmm2,xmm6
 	pxor	xmm14,xmm2
 	pshufb	xmm14,XMMWORD[$L$rol16]
@@ -1965,9 +1965,9 @@
 	pslld	xmm3,7
 	psrld	xmm6,25
 	pxor	xmm6,xmm3
-DB	102,15,58,15,246,4
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,12
+	palignr	xmm6,xmm6,4
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,12
 	paddd	xmm0,xmm4
 	pxor	xmm12,xmm0
 	pshufb	xmm12,XMMWORD[$L$rol16]
@@ -1986,9 +1986,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -2007,9 +2007,9 @@
 	pslld	xmm3,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm3
-DB	102,15,58,15,237,12
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,4
+	palignr	xmm5,xmm5,12
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,4
 	paddd	xmm2,xmm6
 	pxor	xmm14,xmm2
 	pshufb	xmm14,XMMWORD[$L$rol16]
@@ -2028,9 +2028,9 @@
 	pslld	xmm3,7
 	psrld	xmm6,25
 	pxor	xmm6,xmm3
-DB	102,15,58,15,246,12
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,4
+	palignr	xmm6,xmm6,12
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,4
 
 	dec	r10
 	jnz	NEAR $L$open_sse_128_rounds
@@ -2220,10 +2220,10 @@
 	pxor	xmm14,xmm2
 	pxor	xmm13,xmm1
 	pxor	xmm12,xmm0
-DB	102,69,15,56,0,248
-DB	102,69,15,56,0,240
-DB	102,69,15,56,0,232
-DB	102,69,15,56,0,224
+	pshufb	xmm15,xmm8
+	pshufb	xmm14,xmm8
+	pshufb	xmm13,xmm8
+	pshufb	xmm12,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
 	paddd	xmm11,xmm15
 	paddd	xmm10,xmm14
@@ -2259,10 +2259,10 @@
 	pxor	xmm14,xmm2
 	pxor	xmm13,xmm1
 	pxor	xmm12,xmm0
-DB	102,69,15,56,0,248
-DB	102,69,15,56,0,240
-DB	102,69,15,56,0,232
-DB	102,69,15,56,0,224
+	pshufb	xmm15,xmm8
+	pshufb	xmm14,xmm8
+	pshufb	xmm13,xmm8
+	pshufb	xmm12,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
 	paddd	xmm11,xmm15
 	paddd	xmm10,xmm14
@@ -2290,18 +2290,18 @@
 	pslld	xmm4,32-25
 	pxor	xmm4,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
-DB	102,15,58,15,255,4
-DB	102,69,15,58,15,219,8
-DB	102,69,15,58,15,255,12
-DB	102,15,58,15,246,4
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,12
-DB	102,15,58,15,237,4
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,12
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm7,xmm7,4
+	palignr	xmm11,xmm11,8
+	palignr	xmm15,xmm15,12
+	palignr	xmm6,xmm6,4
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,12
+	palignr	xmm5,xmm5,4
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	movdqa	XMMWORD[(160+80)+rbp],xmm8
 	movdqa	xmm8,XMMWORD[$L$rol16]
 	paddd	xmm3,xmm7
@@ -2312,10 +2312,10 @@
 	pxor	xmm14,xmm2
 	pxor	xmm13,xmm1
 	pxor	xmm12,xmm0
-DB	102,69,15,56,0,248
-DB	102,69,15,56,0,240
-DB	102,69,15,56,0,232
-DB	102,69,15,56,0,224
+	pshufb	xmm15,xmm8
+	pshufb	xmm14,xmm8
+	pshufb	xmm13,xmm8
+	pshufb	xmm12,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
 	paddd	xmm11,xmm15
 	paddd	xmm10,xmm14
@@ -2351,10 +2351,10 @@
 	pxor	xmm14,xmm2
 	pxor	xmm13,xmm1
 	pxor	xmm12,xmm0
-DB	102,69,15,56,0,248
-DB	102,69,15,56,0,240
-DB	102,69,15,56,0,232
-DB	102,69,15,56,0,224
+	pshufb	xmm15,xmm8
+	pshufb	xmm14,xmm8
+	pshufb	xmm13,xmm8
+	pshufb	xmm12,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
 	paddd	xmm11,xmm15
 	paddd	xmm10,xmm14
@@ -2382,18 +2382,18 @@
 	pslld	xmm4,32-25
 	pxor	xmm4,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
-DB	102,15,58,15,255,12
-DB	102,69,15,58,15,219,8
-DB	102,69,15,58,15,255,4
-DB	102,15,58,15,246,12
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,4
-DB	102,15,58,15,237,12
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,4
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm7,xmm7,12
+	palignr	xmm11,xmm11,8
+	palignr	xmm15,xmm15,4
+	palignr	xmm6,xmm6,12
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,4
+	palignr	xmm5,xmm5,12
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 
 	dec	r10
 	jnz	NEAR $L$seal_sse_init_rounds
@@ -2516,10 +2516,10 @@
 	pxor	xmm14,xmm2
 	pxor	xmm13,xmm1
 	pxor	xmm12,xmm0
-DB	102,69,15,56,0,248
-DB	102,69,15,56,0,240
-DB	102,69,15,56,0,232
-DB	102,69,15,56,0,224
+	pshufb	xmm15,xmm8
+	pshufb	xmm14,xmm8
+	pshufb	xmm13,xmm8
+	pshufb	xmm12,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
 	paddd	xmm11,xmm15
 	paddd	xmm10,xmm14
@@ -2568,10 +2568,10 @@
 	pxor	xmm14,xmm2
 	pxor	xmm13,xmm1
 	pxor	xmm12,xmm0
-DB	102,69,15,56,0,248
-DB	102,69,15,56,0,240
-DB	102,69,15,56,0,232
-DB	102,69,15,56,0,224
+	pshufb	xmm15,xmm8
+	pshufb	xmm14,xmm8
+	pshufb	xmm13,xmm8
+	pshufb	xmm12,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
 	paddd	xmm11,xmm15
 	paddd	xmm10,xmm14
@@ -2612,18 +2612,18 @@
 	imul	r9,r12
 	add	r15,r10
 	adc	r9,rdx
-DB	102,15,58,15,255,4
-DB	102,69,15,58,15,219,8
-DB	102,69,15,58,15,255,12
-DB	102,15,58,15,246,4
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,12
-DB	102,15,58,15,237,4
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,12
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm7,xmm7,4
+	palignr	xmm11,xmm11,8
+	palignr	xmm15,xmm15,12
+	palignr	xmm6,xmm6,4
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,12
+	palignr	xmm5,xmm5,4
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	movdqa	XMMWORD[(160+80)+rbp],xmm8
 	movdqa	xmm8,XMMWORD[$L$rol16]
 	paddd	xmm3,xmm7
@@ -2648,10 +2648,10 @@
 	adc	r12,0
 	pxor	xmm13,xmm1
 	pxor	xmm12,xmm0
-DB	102,69,15,56,0,248
-DB	102,69,15,56,0,240
-DB	102,69,15,56,0,232
-DB	102,69,15,56,0,224
+	pshufb	xmm15,xmm8
+	pshufb	xmm14,xmm8
+	pshufb	xmm13,xmm8
+	pshufb	xmm12,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
 	paddd	xmm11,xmm15
 	paddd	xmm10,xmm14
@@ -2687,10 +2687,10 @@
 	pxor	xmm14,xmm2
 	pxor	xmm13,xmm1
 	pxor	xmm12,xmm0
-DB	102,69,15,56,0,248
-DB	102,69,15,56,0,240
-DB	102,69,15,56,0,232
-DB	102,69,15,56,0,224
+	pshufb	xmm15,xmm8
+	pshufb	xmm14,xmm8
+	pshufb	xmm13,xmm8
+	pshufb	xmm12,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
 	paddd	xmm11,xmm15
 	paddd	xmm10,xmm14
@@ -2718,18 +2718,18 @@
 	pslld	xmm4,32-25
 	pxor	xmm4,xmm8
 	movdqa	xmm8,XMMWORD[((160+80))+rbp]
-DB	102,15,58,15,255,12
-DB	102,69,15,58,15,219,8
-DB	102,69,15,58,15,255,4
-DB	102,15,58,15,246,12
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,4
-DB	102,15,58,15,237,12
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,4
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm7,xmm7,12
+	palignr	xmm11,xmm11,8
+	palignr	xmm15,xmm15,4
+	palignr	xmm6,xmm6,12
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,4
+	palignr	xmm5,xmm5,12
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 
 	lea	rdi,[16+rdi]
 	dec	r8
@@ -2942,9 +2942,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	paddd	xmm0,xmm4
 	pxor	xmm12,xmm0
 	pshufb	xmm12,XMMWORD[$L$rol16]
@@ -2963,9 +2963,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 	add	r10,QWORD[((0+0))+rdi]
 	adc	r11,QWORD[((8+0))+rdi]
 	adc	r12,1
@@ -3095,9 +3095,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -3116,9 +3116,9 @@
 	pslld	xmm3,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm3
-DB	102,15,58,15,237,4
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,12
+	palignr	xmm5,xmm5,4
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,12
 	add	r10,QWORD[((0+0))+rdi]
 	adc	r11,QWORD[((8+0))+rdi]
 	adc	r12,1
@@ -3177,9 +3177,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -3198,9 +3198,9 @@
 	pslld	xmm3,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm3
-DB	102,15,58,15,237,12
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,4
+	palignr	xmm5,xmm5,12
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,4
 
 	lea	rdi,[16+rdi]
 	dec	rcx
@@ -3315,9 +3315,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -3336,9 +3336,9 @@
 	pslld	xmm3,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm3
-DB	102,15,58,15,237,4
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,12
+	palignr	xmm5,xmm5,4
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,12
 	paddd	xmm2,xmm6
 	pxor	xmm14,xmm2
 	pshufb	xmm14,XMMWORD[$L$rol16]
@@ -3357,9 +3357,9 @@
 	pslld	xmm3,7
 	psrld	xmm6,25
 	pxor	xmm6,xmm3
-DB	102,15,58,15,246,4
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,12
+	palignr	xmm6,xmm6,4
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,12
 	add	r10,QWORD[((0+0))+rdi]
 	adc	r11,QWORD[((8+0))+rdi]
 	adc	r12,1
@@ -3418,9 +3418,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -3439,9 +3439,9 @@
 	pslld	xmm3,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm3
-DB	102,15,58,15,237,12
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,4
+	palignr	xmm5,xmm5,12
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,4
 	paddd	xmm2,xmm6
 	pxor	xmm14,xmm2
 	pshufb	xmm14,XMMWORD[$L$rol16]
@@ -3460,9 +3460,9 @@
 	pslld	xmm3,7
 	psrld	xmm6,25
 	pxor	xmm6,xmm3
-DB	102,15,58,15,246,12
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,4
+	palignr	xmm6,xmm6,12
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,4
 
 	lea	rdi,[16+rdi]
 	dec	rcx
@@ -3714,7 +3714,7 @@
 
 
 
-DB	102,77,15,126,253
+	movq	r13,xmm15
 	pextrq	r14,xmm15,1
 	add	r10,r13
 	adc	r11,r14
@@ -3830,7 +3830,7 @@
 	lea	r15,[$L$and_masks]
 	shl	rbx,4
 	pand	xmm15,XMMWORD[((-16))+rbx*1+r15]
-DB	102,77,15,126,253
+	movq	r13,xmm15
 	pextrq	r14,xmm15,1
 	add	r10,r13
 	adc	r11,r14
@@ -4005,9 +4005,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,4
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,12
+	palignr	xmm4,xmm4,4
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,12
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -4026,9 +4026,9 @@
 	pslld	xmm3,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm3
-DB	102,15,58,15,237,4
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,12
+	palignr	xmm5,xmm5,4
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,12
 	paddd	xmm2,xmm6
 	pxor	xmm14,xmm2
 	pshufb	xmm14,XMMWORD[$L$rol16]
@@ -4047,9 +4047,9 @@
 	pslld	xmm3,7
 	psrld	xmm6,25
 	pxor	xmm6,xmm3
-DB	102,15,58,15,246,4
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,12
+	palignr	xmm6,xmm6,4
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,12
 	paddd	xmm0,xmm4
 	pxor	xmm12,xmm0
 	pshufb	xmm12,XMMWORD[$L$rol16]
@@ -4068,9 +4068,9 @@
 	pslld	xmm3,7
 	psrld	xmm4,25
 	pxor	xmm4,xmm3
-DB	102,15,58,15,228,12
-DB	102,69,15,58,15,192,8
-DB	102,69,15,58,15,228,4
+	palignr	xmm4,xmm4,12
+	palignr	xmm8,xmm8,8
+	palignr	xmm12,xmm12,4
 	paddd	xmm1,xmm5
 	pxor	xmm13,xmm1
 	pshufb	xmm13,XMMWORD[$L$rol16]
@@ -4089,9 +4089,9 @@
 	pslld	xmm3,7
 	psrld	xmm5,25
 	pxor	xmm5,xmm3
-DB	102,15,58,15,237,12
-DB	102,69,15,58,15,201,8
-DB	102,69,15,58,15,237,4
+	palignr	xmm5,xmm5,12
+	palignr	xmm9,xmm9,8
+	palignr	xmm13,xmm13,4
 	paddd	xmm2,xmm6
 	pxor	xmm14,xmm2
 	pshufb	xmm14,XMMWORD[$L$rol16]
@@ -4110,9 +4110,9 @@
 	pslld	xmm3,7
 	psrld	xmm6,25
 	pxor	xmm6,xmm3
-DB	102,15,58,15,246,12
-DB	102,69,15,58,15,210,8
-DB	102,69,15,58,15,246,4
+	palignr	xmm6,xmm6,12
+	palignr	xmm10,xmm10,8
+	palignr	xmm14,xmm14,4
 
 	dec	r10
 	jnz	NEAR $L$seal_sse_128_rounds