Stop manually encoding a bunch of x86-64 instructions
The perlasm code used to manually encode some instructions, presumably
to accommodate older assemblers that don't recognize them. The newest of
these (the SHA instructions) seems to have been added in binutils 2.24,
released in 2013.
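For example, the aesni() helper being deleted from aesni-x86_64.pl
rewrote a mnemonic such as
    aesenc  %xmm1,%xmm2
into the hand-assembled form
    .byte   102,15,56,220,209
(0x66,0x0f,0x38,0xdc,0xd1); the regenerated files below keep the
mnemonic and let the assembler emit the same bytes.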
Remove the transforms so we don't have to worry about bugs in some
ad-hoc perl code. I confirmed this was equivalent by comparing the
output of `objdump -d` on the assembled object files.
This revealed one issue in the xlate script where it tried to suffix
rdrand, which is apparently unsuffixable.
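A minimal sketch of the objdump comparison described above (not the
exact script used; old.o and new.o are placeholder names for objects
assembled from the old and new perlasm output):
    #!/usr/bin/env perl
    # Disassemble both objects and check that the text is identical.
    # old.o and new.o are placeholders for the objects built from the
    # old and new perlasm output.
    use strict;
    use warnings;
    sub disasm {
        my ($obj) = @_;
        my $out = `objdump -d $obj`;
        die "objdump -d $obj failed\n" if $? != 0;
        return $out;
    }
    die "disassembly differs\n" if disasm("old.o") ne disasm("new.o");
    print "disassembly is identical\n";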
Change-Id: I51377e38ec06b099e730da29b85743188abf9723
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/77388
Commit-Queue: Bob Beck <bbe@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 08df44e..930ae14 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -3983,64 +3983,7 @@
___
}
-sub rex {
- local *opcode=shift;
- my ($dst,$src)=@_;
- my $rex=0;
-
- $rex|=0x04 if($dst>=8);
- $rex|=0x01 if($src>=8);
- push @opcode,$rex|0x40 if($rex);
-}
-
-sub aesni {
- my $line=shift;
- my @opcode=(0x66);
-
- if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
- rex(\@opcode,$4,$3);
- push @opcode,0x0f,0x3a,0xdf;
- push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
- my $c=$2;
- push @opcode,$c=~/^0/?oct($c):$c;
- return ".byte\t".join(',',@opcode);
- }
- elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
- my %opcodelet = (
- "aesimc" => 0xdb,
- "aesenc" => 0xdc, "aesenclast" => 0xdd,
- "aesdec" => 0xde, "aesdeclast" => 0xdf
- );
- return undef if (!defined($opcodelet{$1}));
- rex(\@opcode,$3,$2);
- push @opcode,0x0f,0x38,$opcodelet{$1};
- push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
- return ".byte\t".join(',',@opcode);
- }
- elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
- my %opcodelet = (
- "aesenc" => 0xdc, "aesenclast" => 0xdd,
- "aesdec" => 0xde, "aesdeclast" => 0xdf
- );
- return undef if (!defined($opcodelet{$1}));
- my $off = $2;
- push @opcode,0x44 if ($3>=8);
- push @opcode,0x0f,0x38,$opcodelet{$1};
- push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
- push @opcode,($off=~/^0/?oct($off):$off)&0xff;
- return ".byte\t".join(',',@opcode);
- }
- return $line;
-}
-
-sub movbe {
- ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
-}
-
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
-#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
-$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
print $code;
diff --git a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
index 2dc24f2..30f1238 100755
--- a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
@@ -2057,45 +2057,9 @@
####################################################################
-sub sha1rnds4 {
- if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
- my @opcode=(0x0f,0x3a,0xcc);
- push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
- my $c=$1;
- push @opcode,$c=~/^0/?oct($c):$c;
- return ".byte\t".join(',',@opcode);
- } else {
- return "sha1rnds4\t".@_[0];
- }
-}
-
-sub sha1op38 {
- my $instr = shift;
- my %opcodelet = (
- "sha1nexte" => 0xc8,
- "sha1msg1" => 0xc9,
- "sha1msg2" => 0xca );
-
- if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
- my @opcode=(0x0f,0x38);
- my $rex=0;
- $rex|=0x04 if ($2>=8);
- $rex|=0x01 if ($1>=8);
- unshift @opcode,0x40|$rex if ($rex);
- push @opcode,$opcodelet{$instr};
- push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
- return ".byte\t".join(',',@opcode);
- } else {
- return $instr."\t".@_[0];
- }
-}
-
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
- s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
- s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
-
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
index 3a31a16..6768bf3 100755
--- a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -2042,28 +2042,9 @@
___
}
-sub sha256op38 {
- my $instr = shift;
- my %opcodelet = (
- "sha256rnds2" => 0xcb,
- "sha256msg1" => 0xcc,
- "sha256msg2" => 0xcd );
-
- if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
- my @opcode=(0x0f,0x38);
- push @opcode,$opcodelet{$instr};
- push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
- return ".byte\t".join(',',@opcode);
- } else {
- return $instr."\t".@_[0];
- }
-}
-
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
- s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
-
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl
index a0bade3..2045529 100755
--- a/crypto/perlasm/x86_64-xlate.pl
+++ b/crypto/perlasm/x86_64-xlate.pl
@@ -127,7 +127,7 @@
if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain...
$self->{op} = $1;
$self->{sz} = $2;
- } elsif ($self->{op} =~ /call|jmp/) {
+ } elsif ($self->{op} =~ /call|jmp|^rdrand$/) {
$self->{sz} = "";
} elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn
$self->{sz} = "";
@@ -1319,191 +1319,6 @@
}
}
-# Upon initial x86_64 introduction SSE>2 extensions were not introduced
-# yet. In order not to be bothered by tracing exact assembler versions,
-# but at the same time to provide a bare security minimum of AES-NI, we
-# hard-code some instructions. Extensions past AES-NI on the other hand
-# are traced by examining assembler version in individual perlasm
-# modules...
-
-my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3,
- "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 );
-
-sub rex {
- my $opcode=shift;
- my ($dst,$src,$rex)=@_;
-
- $rex|=0x04 if($dst>=8);
- $rex|=0x01 if($src>=8);
- push @$opcode,($rex|0x40) if ($rex);
-}
-
-my $movq = sub { # elderly gas can't handle inter-register movq
- my $arg = shift;
- my @opcode=(0x66);
- if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) {
- my ($src,$dst)=($1,$2);
- if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
- rex(\@opcode,$src,$dst,0x8);
- push @opcode,0x0f,0x7e;
- push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M
- @opcode;
- } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) {
- my ($src,$dst)=($2,$1);
- if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
- rex(\@opcode,$src,$dst,0x8);
- push @opcode,0x0f,0x6e;
- push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M
- @opcode;
- } else {
- ();
- }
-};
-
-my $pextrd = sub {
- if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) {
- my @opcode=(0x66);
- my $imm=$1;
- my $src=$2;
- my $dst=$3;
- if ($dst =~ /%r([0-9]+)d/) { $dst = $1; }
- elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; }
- rex(\@opcode,$src,$dst);
- push @opcode,0x0f,0x3a,0x16;
- push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M
- push @opcode,$imm;
- @opcode;
- } else {
- ();
- }
-};
-
-my $pinsrd = sub {
- if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) {
- my @opcode=(0x66);
- my $imm=$1;
- my $src=$2;
- my $dst=$3;
- if ($src =~ /%r([0-9]+)/) { $src = $1; }
- elsif ($src =~ /%e/) { $src = $regrm{$src}; }
- rex(\@opcode,$dst,$src);
- push @opcode,0x0f,0x3a,0x22;
- push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M
- push @opcode,$imm;
- @opcode;
- } else {
- ();
- }
-};
-
-my $pshufb = sub {
- if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
- my @opcode=(0x66);
- rex(\@opcode,$2,$1);
- push @opcode,0x0f,0x38,0x00;
- push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
- @opcode;
- } else {
- ();
- }
-};
-
-my $palignr = sub {
- if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
- my @opcode=(0x66);
- rex(\@opcode,$3,$2);
- push @opcode,0x0f,0x3a,0x0f;
- push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
- push @opcode,$1;
- @opcode;
- } else {
- ();
- }
-};
-
-my $pclmulqdq = sub {
- if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
- my @opcode=(0x66);
- rex(\@opcode,$3,$2);
- push @opcode,0x0f,0x3a,0x44;
- push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
- my $c=$1;
- push @opcode,$c=~/^0/?oct($c):$c;
- @opcode;
- } else {
- ();
- }
-};
-
-my $rdrand = sub {
- if (shift =~ /%[er](\w+)/) {
- my @opcode=();
- my $dst=$1;
- if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
- rex(\@opcode,0,$dst,8);
- push @opcode,0x0f,0xc7,0xf0|($dst&7);
- @opcode;
- } else {
- ();
- }
-};
-
-my $rdseed = sub {
- if (shift =~ /%[er](\w+)/) {
- my @opcode=();
- my $dst=$1;
- if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
- rex(\@opcode,0,$dst,8);
- push @opcode,0x0f,0xc7,0xf8|($dst&7);
- @opcode;
- } else {
- ();
- }
-};
-
-# Not all AVX-capable assemblers recognize AMD XOP extension. Since we
-# are using only two instructions hand-code them in order to be excused
-# from chasing assembler versions...
-
-sub rxb {
- my $opcode=shift;
- my ($dst,$src1,$src2,$rxb)=@_;
-
- $rxb|=0x7<<5;
- $rxb&=~(0x04<<5) if($dst>=8);
- $rxb&=~(0x01<<5) if($src1>=8);
- $rxb&=~(0x02<<5) if($src2>=8);
- push @$opcode,$rxb;
-}
-
-my $vprotd = sub {
- if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
- my @opcode=(0x8f);
- rxb(\@opcode,$3,$2,-1,0x08);
- push @opcode,0x78,0xc2;
- push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
- my $c=$1;
- push @opcode,$c=~/^0/?oct($c):$c;
- @opcode;
- } else {
- ();
- }
-};
-
-my $vprotq = sub {
- if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
- my @opcode=(0x8f);
- rxb(\@opcode,$3,$2,-1,0x08);
- push @opcode,0x78,0xc3;
- push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
- my $c=$1;
- push @opcode,$c=~/^0/?oct($c):$c;
- @opcode;
- } else {
- ();
- }
-};
-
# Intel Control-flow Enforcement Technology extension. All functions and
# indirect branch targets will have to start with this instruction...
diff --git a/gen/bcm/aesni-x86_64-apple.S b/gen/bcm/aesni-x86_64-apple.S
index 0247a2d..958cc5a 100644
--- a/gen/bcm/aesni-x86_64-apple.S
+++ b/gen/bcm/aesni-x86_64-apple.S
@@ -23,12 +23,12 @@
leaq 32(%rdx),%rdx
xorps %xmm0,%xmm2
L$oop_enc1_1:
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
jnz L$oop_enc1_1
-.byte 102,15,56,221,209
+ aesenclast %xmm1,%xmm2
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
@@ -51,12 +51,12 @@
leaq 32(%rdx),%rdx
xorps %xmm0,%xmm2
L$oop_dec1_2:
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
jnz L$oop_dec1_2
-.byte 102,15,56,223,209
+ aesdeclast %xmm1,%xmm2
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
@@ -79,19 +79,19 @@
addq $16,%rax
L$enc_loop2:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop2
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenclast %xmm0,%xmm2
+ aesenclast %xmm0,%xmm3
ret
@@ -110,19 +110,19 @@
addq $16,%rax
L$dec_loop2:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop2
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdeclast %xmm0,%xmm2
+ aesdeclast %xmm0,%xmm3
ret
@@ -142,23 +142,23 @@
addq $16,%rax
L$enc_loop3:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop3
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenclast %xmm0,%xmm2
+ aesenclast %xmm0,%xmm3
+ aesenclast %xmm0,%xmm4
ret
@@ -178,23 +178,23 @@
addq $16,%rax
L$dec_loop3:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop3
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdeclast %xmm0,%xmm2
+ aesdeclast %xmm0,%xmm3
+ aesdeclast %xmm0,%xmm4
ret
@@ -216,27 +216,27 @@
addq $16,%rax
L$enc_loop4:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop4
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
-.byte 102,15,56,221,232
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenclast %xmm0,%xmm2
+ aesenclast %xmm0,%xmm3
+ aesenclast %xmm0,%xmm4
+ aesenclast %xmm0,%xmm5
ret
@@ -258,27 +258,27 @@
addq $16,%rax
L$dec_loop4:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop4
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
-.byte 102,15,56,223,232
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdeclast %xmm0,%xmm2
+ aesdeclast %xmm0,%xmm3
+ aesdeclast %xmm0,%xmm4
+ aesdeclast %xmm0,%xmm5
ret
@@ -292,49 +292,49 @@
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
leaq 32(%rcx,%rax,1),%rcx
negq %rax
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm3
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
-.byte 102,15,56,220,225
+ aesenc %xmm1,%xmm4
pxor %xmm0,%xmm7
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp L$enc_loop6_enter
.p2align 4
L$enc_loop6:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
L$enc_loop6_enter:
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop6
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenclast %xmm0,%xmm2
+ aesenclast %xmm0,%xmm3
+ aesenclast %xmm0,%xmm4
+ aesenclast %xmm0,%xmm5
+ aesenclast %xmm0,%xmm6
+ aesenclast %xmm0,%xmm7
ret
@@ -348,49 +348,49 @@
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
leaq 32(%rcx,%rax,1),%rcx
negq %rax
-.byte 102,15,56,222,217
+ aesdec %xmm1,%xmm3
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
-.byte 102,15,56,222,225
+ aesdec %xmm1,%xmm4
pxor %xmm0,%xmm7
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp L$dec_loop6_enter
.p2align 4
L$dec_loop6:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
L$dec_loop6_enter:
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop6
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdeclast %xmm0,%xmm2
+ aesdeclast %xmm0,%xmm3
+ aesdeclast %xmm0,%xmm4
+ aesdeclast %xmm0,%xmm5
+ aesdeclast %xmm0,%xmm6
+ aesdeclast %xmm0,%xmm7
ret
@@ -408,55 +408,55 @@
pxor %xmm0,%xmm6
leaq 32(%rcx,%rax,1),%rcx
negq %rax
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
pxor %xmm0,%xmm7
pxor %xmm0,%xmm8
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm3
pxor %xmm0,%xmm9
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp L$enc_loop8_inner
.p2align 4
L$enc_loop8:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
L$enc_loop8_inner:
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
L$enc_loop8_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop8
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
-.byte 102,68,15,56,221,192
-.byte 102,68,15,56,221,200
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
+ aesenclast %xmm0,%xmm2
+ aesenclast %xmm0,%xmm3
+ aesenclast %xmm0,%xmm4
+ aesenclast %xmm0,%xmm5
+ aesenclast %xmm0,%xmm6
+ aesenclast %xmm0,%xmm7
+ aesenclast %xmm0,%xmm8
+ aesenclast %xmm0,%xmm9
ret
@@ -474,55 +474,55 @@
pxor %xmm0,%xmm6
leaq 32(%rcx,%rax,1),%rcx
negq %rax
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
pxor %xmm0,%xmm7
pxor %xmm0,%xmm8
-.byte 102,15,56,222,217
+ aesdec %xmm1,%xmm3
pxor %xmm0,%xmm9
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp L$dec_loop8_inner
.p2align 4
L$dec_loop8:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
L$dec_loop8_inner:
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
L$dec_loop8_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop8
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
-.byte 102,68,15,56,223,192
-.byte 102,68,15,56,223,200
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
+ aesdeclast %xmm0,%xmm2
+ aesdeclast %xmm0,%xmm3
+ aesdeclast %xmm0,%xmm4
+ aesdeclast %xmm0,%xmm5
+ aesdeclast %xmm0,%xmm6
+ aesdeclast %xmm0,%xmm7
+ aesdeclast %xmm0,%xmm8
+ aesdeclast %xmm0,%xmm9
ret
@@ -634,12 +634,12 @@
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_enc1_3:
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_enc1_3
-.byte 102,15,56,221,209
+ aesenclast %xmm1,%xmm2
movups %xmm2,(%rsi)
jmp L$ecb_ret
.p2align 4
@@ -795,12 +795,12 @@
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_dec1_4:
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_dec1_4
-.byte 102,15,56,223,209
+ aesdeclast %xmm1,%xmm2
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp L$ecb_ret
@@ -895,12 +895,12 @@
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_enc1_5:
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
decl %edx
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_enc1_5
-.byte 102,15,56,221,209
+ aesenclast %xmm1,%xmm2
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
xorps %xmm3,%xmm2
@@ -943,17 +943,17 @@
bswapl %edx
xorl %ebp,%eax
xorl %ebp,%edx
-.byte 102,15,58,34,216,3
+ pinsrd $3,%eax,%xmm3
leaq 3(%r8),%rax
movdqa %xmm3,16(%rsp)
-.byte 102,15,58,34,226,3
+ pinsrd $3,%edx,%xmm4
bswapl %eax
movq %r10,%rdx
leaq 4(%r8),%r10
movdqa %xmm4,32(%rsp)
xorl %ebp,%eax
bswapl %r10d
-.byte 102,15,58,34,232,3
+ pinsrd $3,%eax,%xmm5
xorl %ebp,%r10d
movdqa %xmm5,48(%rsp)
leaq 5(%r8),%r9
@@ -987,163 +987,163 @@
L$ctr32_loop8:
addl $8,%r8d
movdqa 96(%rsp),%xmm8
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
movl %r8d,%r9d
movdqa 112(%rsp),%xmm9
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm3
bswapl %r9d
movups 32-128(%rcx),%xmm0
-.byte 102,15,56,220,225
+ aesenc %xmm1,%xmm4
xorl %ebp,%r9d
nop
-.byte 102,15,56,220,233
+ aesenc %xmm1,%xmm5
movl %r9d,0+12(%rsp)
leaq 1(%r8),%r9
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movups 48-128(%rcx),%xmm1
bswapl %r9d
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
xorl %ebp,%r9d
.byte 0x66,0x90
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
movl %r9d,16+12(%rsp)
leaq 2(%r8),%r9
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups 64-128(%rcx),%xmm0
bswapl %r9d
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
xorl %ebp,%r9d
.byte 0x66,0x90
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
movl %r9d,32+12(%rsp)
leaq 3(%r8),%r9
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movups 80-128(%rcx),%xmm1
bswapl %r9d
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
xorl %ebp,%r9d
.byte 0x66,0x90
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
movl %r9d,48+12(%rsp)
leaq 4(%r8),%r9
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups 96-128(%rcx),%xmm0
bswapl %r9d
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
xorl %ebp,%r9d
.byte 0x66,0x90
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
movl %r9d,64+12(%rsp)
leaq 5(%r8),%r9
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movups 112-128(%rcx),%xmm1
bswapl %r9d
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
xorl %ebp,%r9d
.byte 0x66,0x90
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
movl %r9d,80+12(%rsp)
leaq 6(%r8),%r9
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups 128-128(%rcx),%xmm0
bswapl %r9d
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
xorl %ebp,%r9d
.byte 0x66,0x90
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
movl %r9d,96+12(%rsp)
leaq 7(%r8),%r9
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movups 144-128(%rcx),%xmm1
bswapl %r9d
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
xorl %ebp,%r9d
movdqu 0(%rdi),%xmm10
-.byte 102,15,56,220,232
+ aesenc %xmm0,%xmm5
movl %r9d,112+12(%rsp)
cmpl $11,%eax
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups 160-128(%rcx),%xmm0
jb L$ctr32_enc_done
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movups 176-128(%rcx),%xmm1
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups 192-128(%rcx),%xmm0
je L$ctr32_enc_done
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movups 208-128(%rcx),%xmm1
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups 224-128(%rcx),%xmm0
jmp L$ctr32_enc_done
@@ -1162,35 +1162,35 @@
prefetcht0 448(%rdi)
prefetcht0 512(%rdi)
pxor %xmm0,%xmm15
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movdqu 96(%rdi),%xmm1
leaq 128(%rdi),%rdi
-.byte 102,65,15,56,221,210
+ aesenclast %xmm10,%xmm2
pxor %xmm0,%xmm1
movdqu 112-128(%rdi),%xmm10
-.byte 102,65,15,56,221,219
+ aesenclast %xmm11,%xmm3
pxor %xmm0,%xmm10
movdqa 0(%rsp),%xmm11
-.byte 102,65,15,56,221,228
-.byte 102,65,15,56,221,237
+ aesenclast %xmm12,%xmm4
+ aesenclast %xmm13,%xmm5
movdqa 16(%rsp),%xmm12
movdqa 32(%rsp),%xmm13
-.byte 102,65,15,56,221,246
-.byte 102,65,15,56,221,255
+ aesenclast %xmm14,%xmm6
+ aesenclast %xmm15,%xmm7
movdqa 48(%rsp),%xmm14
movdqa 64(%rsp),%xmm15
-.byte 102,68,15,56,221,193
+ aesenclast %xmm1,%xmm8
movdqa 80(%rsp),%xmm0
movups 16-128(%rcx),%xmm1
-.byte 102,69,15,56,221,202
+ aesenclast %xmm10,%xmm9
movups %xmm2,(%rsi)
movdqa %xmm11,%xmm2
@@ -1229,19 +1229,19 @@
pxor %xmm9,%xmm9
movups 16(%rcx),%xmm0
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
leaq 32-16(%rcx,%rax,1),%rcx
negq %rax
-.byte 102,15,56,220,225
+ aesenc %xmm1,%xmm4
addq $16,%rax
movups (%rdi),%xmm10
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
movups 16(%rdi),%xmm11
movups 32(%rdi),%xmm12
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
call L$enc_loop8_enter
@@ -1272,20 +1272,20 @@
.p2align 5
L$ctr32_loop4:
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
leaq 16(%rcx),%rcx
decl %eax
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
movups (%rcx),%xmm1
jnz L$ctr32_loop4
-.byte 102,15,56,221,209
-.byte 102,15,56,221,217
+ aesenclast %xmm1,%xmm2
+ aesenclast %xmm1,%xmm3
movups (%rdi),%xmm10
movups 16(%rdi),%xmm11
-.byte 102,15,56,221,225
-.byte 102,15,56,221,233
+ aesenclast %xmm1,%xmm4
+ aesenclast %xmm1,%xmm5
movups 32(%rdi),%xmm12
movups 48(%rdi),%xmm13
@@ -1301,16 +1301,16 @@
.p2align 5
L$ctr32_loop3:
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
leaq 16(%rcx),%rcx
decl %eax
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
movups (%rcx),%xmm1
jnz L$ctr32_loop3
-.byte 102,15,56,221,209
-.byte 102,15,56,221,217
-.byte 102,15,56,221,225
+ aesenclast %xmm1,%xmm2
+ aesenclast %xmm1,%xmm3
+ aesenclast %xmm1,%xmm4
movups (%rdi),%xmm10
xorps %xmm10,%xmm2
@@ -1393,12 +1393,12 @@
leaq 32(%rcx),%rcx
xorps %xmm3,%xmm2
L$oop_enc1_6:
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_enc1_6
-.byte 102,15,56,221,209
+ aesenclast %xmm1,%xmm2
movl %r10d,%eax
movq %r11,%rcx
movups %xmm2,0(%rsi)
@@ -1444,12 +1444,12 @@
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_dec1_7:
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
decl %r10d
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_dec1_7
-.byte 102,15,56,223,209
+ aesdeclast %xmm1,%xmm2
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movdqu %xmm4,(%r8)
@@ -1508,166 +1508,166 @@
pxor %xmm0,%xmm7
pxor %xmm0,%xmm8
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
pxor %xmm0,%xmm9
movups 32-112(%rcx),%xmm0
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
adcq $0,%rbp
andq $128,%rbp
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm9
addq %rdi,%rbp
movups 48-112(%rcx),%xmm1
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups 64-112(%rcx),%xmm0
nop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
movups 80-112(%rcx),%xmm1
nop
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups 96-112(%rcx),%xmm0
nop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
movups 112-112(%rcx),%xmm1
nop
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups 128-112(%rcx),%xmm0
nop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
movups 144-112(%rcx),%xmm1
cmpl $11,%eax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups 160-112(%rcx),%xmm0
jb L$cbc_dec_done
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
movups 176-112(%rcx),%xmm1
nop
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups 192-112(%rcx),%xmm0
je L$cbc_dec_done
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
movups 208-112(%rcx),%xmm1
nop
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups 224-112(%rcx),%xmm0
jmp L$cbc_dec_done
.p2align 4
L$cbc_dec_done:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
pxor %xmm0,%xmm12
pxor %xmm0,%xmm13
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
pxor %xmm0,%xmm14
pxor %xmm0,%xmm15
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
movdqu 80(%rdi),%xmm1
-.byte 102,65,15,56,223,210
+ aesdeclast %xmm10,%xmm2
movdqu 96(%rdi),%xmm10
pxor %xmm0,%xmm1
-.byte 102,65,15,56,223,219
+ aesdeclast %xmm11,%xmm3
pxor %xmm0,%xmm10
movdqu 112(%rdi),%xmm0
-.byte 102,65,15,56,223,228
+ aesdeclast %xmm12,%xmm4
leaq 128(%rdi),%rdi
movdqu 0(%rbp),%xmm11
-.byte 102,65,15,56,223,237
-.byte 102,65,15,56,223,246
+ aesdeclast %xmm13,%xmm5
+ aesdeclast %xmm14,%xmm6
movdqu 16(%rbp),%xmm12
movdqu 32(%rbp),%xmm13
-.byte 102,65,15,56,223,255
-.byte 102,68,15,56,223,193
+ aesdeclast %xmm15,%xmm7
+ aesdeclast %xmm1,%xmm8
movdqu 48(%rbp),%xmm14
movdqu 64(%rbp),%xmm15
-.byte 102,69,15,56,223,202
+ aesdeclast %xmm10,%xmm9
movdqa %xmm0,%xmm10
movdqu 80(%rbp),%xmm1
movups -112(%rcx),%xmm0
@@ -1811,12 +1811,12 @@
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_dec1_8:
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_dec1_8
-.byte 102,15,56,223,209
+ aesdeclast %xmm1,%xmm2
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
jmp L$cbc_dec_tail_collected
@@ -1927,8 +1927,8 @@
L$dec_key_inverse:
movups (%rdi),%xmm0
movups (%rdx),%xmm1
-.byte 102,15,56,219,192
-.byte 102,15,56,219,201
+ aesimc %xmm0,%xmm0
+ aesimc %xmm1,%xmm1
leaq 16(%rdi),%rdi
leaq -16(%rdx),%rdx
movups %xmm0,16(%rdx)
@@ -1937,7 +1937,7 @@
ja L$dec_key_inverse
movups (%rdi),%xmm0
-.byte 102,15,56,219,192
+ aesimc %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm0,(%rdx)
pxor %xmm0,%xmm0
@@ -1973,25 +1973,25 @@
movl $9,%esi
movups %xmm0,(%rdx)
-.byte 102,15,58,223,200,1
+ aeskeygenassist $0x1,%xmm0,%xmm1
call L$key_expansion_128_cold
-.byte 102,15,58,223,200,2
+ aeskeygenassist $0x2,%xmm0,%xmm1
call L$key_expansion_128
-.byte 102,15,58,223,200,4
+ aeskeygenassist $0x4,%xmm0,%xmm1
call L$key_expansion_128
-.byte 102,15,58,223,200,8
+ aeskeygenassist $0x8,%xmm0,%xmm1
call L$key_expansion_128
-.byte 102,15,58,223,200,16
+ aeskeygenassist $0x10,%xmm0,%xmm1
call L$key_expansion_128
-.byte 102,15,58,223,200,32
+ aeskeygenassist $0x20,%xmm0,%xmm1
call L$key_expansion_128
-.byte 102,15,58,223,200,64
+ aeskeygenassist $0x40,%xmm0,%xmm1
call L$key_expansion_128
-.byte 102,15,58,223,200,128
+ aeskeygenassist $0x80,%xmm0,%xmm1
call L$key_expansion_128
-.byte 102,15,58,223,200,27
+ aeskeygenassist $0x1b,%xmm0,%xmm1
call L$key_expansion_128
-.byte 102,15,58,223,200,54
+ aeskeygenassist $0x36,%xmm0,%xmm1
call L$key_expansion_128
movups %xmm0,(%rax)
movl %esi,80(%rax)
@@ -2004,21 +2004,21 @@
movl $11,%esi
movups %xmm0,(%rdx)
-.byte 102,15,58,223,202,1
+ aeskeygenassist $0x1,%xmm2,%xmm1
call L$key_expansion_192a_cold
-.byte 102,15,58,223,202,2
+ aeskeygenassist $0x2,%xmm2,%xmm1
call L$key_expansion_192b
-.byte 102,15,58,223,202,4
+ aeskeygenassist $0x4,%xmm2,%xmm1
call L$key_expansion_192a
-.byte 102,15,58,223,202,8
+ aeskeygenassist $0x8,%xmm2,%xmm1
call L$key_expansion_192b
-.byte 102,15,58,223,202,16
+ aeskeygenassist $0x10,%xmm2,%xmm1
call L$key_expansion_192a
-.byte 102,15,58,223,202,32
+ aeskeygenassist $0x20,%xmm2,%xmm1
call L$key_expansion_192b
-.byte 102,15,58,223,202,64
+ aeskeygenassist $0x40,%xmm2,%xmm1
call L$key_expansion_192a
-.byte 102,15,58,223,202,128
+ aeskeygenassist $0x80,%xmm2,%xmm1
call L$key_expansion_192b
movups %xmm0,(%rax)
movl %esi,48(%rax)
@@ -2033,31 +2033,31 @@
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
-.byte 102,15,58,223,202,1
+ aeskeygenassist $0x1,%xmm2,%xmm1
call L$key_expansion_256a_cold
-.byte 102,15,58,223,200,1
+ aeskeygenassist $0x1,%xmm0,%xmm1
call L$key_expansion_256b
-.byte 102,15,58,223,202,2
+ aeskeygenassist $0x2,%xmm2,%xmm1
call L$key_expansion_256a
-.byte 102,15,58,223,200,2
+ aeskeygenassist $0x2,%xmm0,%xmm1
call L$key_expansion_256b
-.byte 102,15,58,223,202,4
+ aeskeygenassist $0x4,%xmm2,%xmm1
call L$key_expansion_256a
-.byte 102,15,58,223,200,4
+ aeskeygenassist $0x4,%xmm0,%xmm1
call L$key_expansion_256b
-.byte 102,15,58,223,202,8
+ aeskeygenassist $0x8,%xmm2,%xmm1
call L$key_expansion_256a
-.byte 102,15,58,223,200,8
+ aeskeygenassist $0x8,%xmm0,%xmm1
call L$key_expansion_256b
-.byte 102,15,58,223,202,16
+ aeskeygenassist $0x10,%xmm2,%xmm1
call L$key_expansion_256a
-.byte 102,15,58,223,200,16
+ aeskeygenassist $0x10,%xmm0,%xmm1
call L$key_expansion_256b
-.byte 102,15,58,223,202,32
+ aeskeygenassist $0x20,%xmm2,%xmm1
call L$key_expansion_256a
-.byte 102,15,58,223,200,32
+ aeskeygenassist $0x20,%xmm0,%xmm1
call L$key_expansion_256b
-.byte 102,15,58,223,202,64
+ aeskeygenassist $0x40,%xmm2,%xmm1
call L$key_expansion_256a
movups %xmm0,(%rax)
movl %esi,16(%rax)
@@ -2195,8 +2195,8 @@
.p2align 4
L$oop_key128:
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
pslld $1,%xmm4
leaq 16(%rax),%rax
@@ -2217,8 +2217,8 @@
movdqa L$key_rcon1b(%rip),%xmm4
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
pslld $1,%xmm4
movdqa %xmm2,%xmm3
@@ -2233,8 +2233,8 @@
movdqu %xmm0,(%rax)
movdqa %xmm0,%xmm2
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
@@ -2265,8 +2265,8 @@
L$oop_key192:
movq %xmm2,0(%rax)
movdqa %xmm2,%xmm1
-.byte 102,15,56,0,213
-.byte 102,15,56,221,212
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
pslld $1,%xmm4
leaq 24(%rax),%rax
@@ -2309,8 +2309,8 @@
.p2align 4
L$oop_key256:
-.byte 102,15,56,0,213
-.byte 102,15,56,221,212
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
movdqa %xmm0,%xmm3
pslldq $4,%xmm0
@@ -2329,7 +2329,7 @@
pshufd $0xff,%xmm0,%xmm2
pxor %xmm3,%xmm3
-.byte 102,15,56,221,211
+ aesenclast %xmm3,%xmm2
movdqa %xmm1,%xmm3
pslldq $4,%xmm1
diff --git a/gen/bcm/aesni-x86_64-linux.S b/gen/bcm/aesni-x86_64-linux.S
index bedd98b..4bce582 100644
--- a/gen/bcm/aesni-x86_64-linux.S
+++ b/gen/bcm/aesni-x86_64-linux.S
@@ -24,12 +24,12 @@
leaq 32(%rdx),%rdx
xorps %xmm0,%xmm2
.Loop_enc1_1:
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
jnz .Loop_enc1_1
-.byte 102,15,56,221,209
+ aesenclast %xmm1,%xmm2
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
@@ -52,12 +52,12 @@
leaq 32(%rdx),%rdx
xorps %xmm0,%xmm2
.Loop_dec1_2:
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
jnz .Loop_dec1_2
-.byte 102,15,56,223,209
+ aesdeclast %xmm1,%xmm2
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
@@ -80,19 +80,19 @@
addq $16,%rax
.Lenc_loop2:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop2
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenclast %xmm0,%xmm2
+ aesenclast %xmm0,%xmm3
ret
.cfi_endproc
.size _aesni_encrypt2,.-_aesni_encrypt2
@@ -111,19 +111,19 @@
addq $16,%rax
.Ldec_loop2:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop2
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdeclast %xmm0,%xmm2
+ aesdeclast %xmm0,%xmm3
ret
.cfi_endproc
.size _aesni_decrypt2,.-_aesni_decrypt2
@@ -143,23 +143,23 @@
addq $16,%rax
.Lenc_loop3:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop3
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenclast %xmm0,%xmm2
+ aesenclast %xmm0,%xmm3
+ aesenclast %xmm0,%xmm4
ret
.cfi_endproc
.size _aesni_encrypt3,.-_aesni_encrypt3
@@ -179,23 +179,23 @@
addq $16,%rax
.Ldec_loop3:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop3
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdeclast %xmm0,%xmm2
+ aesdeclast %xmm0,%xmm3
+ aesdeclast %xmm0,%xmm4
ret
.cfi_endproc
.size _aesni_decrypt3,.-_aesni_decrypt3
@@ -217,27 +217,27 @@
addq $16,%rax
.Lenc_loop4:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop4
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
-.byte 102,15,56,221,232
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenclast %xmm0,%xmm2
+ aesenclast %xmm0,%xmm3
+ aesenclast %xmm0,%xmm4
+ aesenclast %xmm0,%xmm5
ret
.cfi_endproc
.size _aesni_encrypt4,.-_aesni_encrypt4
@@ -259,27 +259,27 @@
addq $16,%rax
.Ldec_loop4:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop4
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
-.byte 102,15,56,223,232
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdeclast %xmm0,%xmm2
+ aesdeclast %xmm0,%xmm3
+ aesdeclast %xmm0,%xmm4
+ aesdeclast %xmm0,%xmm5
ret
.cfi_endproc
.size _aesni_decrypt4,.-_aesni_decrypt4
@@ -293,49 +293,49 @@
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
leaq 32(%rcx,%rax,1),%rcx
negq %rax
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm3
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
-.byte 102,15,56,220,225
+ aesenc %xmm1,%xmm4
pxor %xmm0,%xmm7
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp .Lenc_loop6_enter
.align 16
.Lenc_loop6:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
.Lenc_loop6_enter:
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop6
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenclast %xmm0,%xmm2
+ aesenclast %xmm0,%xmm3
+ aesenclast %xmm0,%xmm4
+ aesenclast %xmm0,%xmm5
+ aesenclast %xmm0,%xmm6
+ aesenclast %xmm0,%xmm7
ret
.cfi_endproc
.size _aesni_encrypt6,.-_aesni_encrypt6
@@ -349,49 +349,49 @@
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
leaq 32(%rcx,%rax,1),%rcx
negq %rax
-.byte 102,15,56,222,217
+ aesdec %xmm1,%xmm3
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
-.byte 102,15,56,222,225
+ aesdec %xmm1,%xmm4
pxor %xmm0,%xmm7
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp .Ldec_loop6_enter
.align 16
.Ldec_loop6:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
.Ldec_loop6_enter:
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop6
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdeclast %xmm0,%xmm2
+ aesdeclast %xmm0,%xmm3
+ aesdeclast %xmm0,%xmm4
+ aesdeclast %xmm0,%xmm5
+ aesdeclast %xmm0,%xmm6
+ aesdeclast %xmm0,%xmm7
ret
.cfi_endproc
.size _aesni_decrypt6,.-_aesni_decrypt6
@@ -409,55 +409,55 @@
pxor %xmm0,%xmm6
leaq 32(%rcx,%rax,1),%rcx
negq %rax
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
pxor %xmm0,%xmm7
pxor %xmm0,%xmm8
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm3
pxor %xmm0,%xmm9
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp .Lenc_loop8_inner
.align 16
.Lenc_loop8:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
.Lenc_loop8_inner:
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
.Lenc_loop8_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop8
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
-.byte 102,68,15,56,221,192
-.byte 102,68,15,56,221,200
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
+ aesenclast %xmm0,%xmm2
+ aesenclast %xmm0,%xmm3
+ aesenclast %xmm0,%xmm4
+ aesenclast %xmm0,%xmm5
+ aesenclast %xmm0,%xmm6
+ aesenclast %xmm0,%xmm7
+ aesenclast %xmm0,%xmm8
+ aesenclast %xmm0,%xmm9
ret
.cfi_endproc
.size _aesni_encrypt8,.-_aesni_encrypt8
@@ -475,55 +475,55 @@
pxor %xmm0,%xmm6
leaq 32(%rcx,%rax,1),%rcx
negq %rax
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
pxor %xmm0,%xmm7
pxor %xmm0,%xmm8
-.byte 102,15,56,222,217
+ aesdec %xmm1,%xmm3
pxor %xmm0,%xmm9
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp .Ldec_loop8_inner
.align 16
.Ldec_loop8:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
.Ldec_loop8_inner:
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
.Ldec_loop8_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop8
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
-.byte 102,68,15,56,223,192
-.byte 102,68,15,56,223,200
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
+ aesdeclast %xmm0,%xmm2
+ aesdeclast %xmm0,%xmm3
+ aesdeclast %xmm0,%xmm4
+ aesdeclast %xmm0,%xmm5
+ aesdeclast %xmm0,%xmm6
+ aesdeclast %xmm0,%xmm7
+ aesdeclast %xmm0,%xmm8
+ aesdeclast %xmm0,%xmm9
ret
.cfi_endproc
.size _aesni_decrypt8,.-_aesni_decrypt8
@@ -635,12 +635,12 @@
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_enc1_3:
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_3
-.byte 102,15,56,221,209
+ aesenclast %xmm1,%xmm2
movups %xmm2,(%rsi)
jmp .Lecb_ret
.align 16
@@ -796,12 +796,12 @@
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_dec1_4:
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_dec1_4
-.byte 102,15,56,223,209
+ aesdeclast %xmm1,%xmm2
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp .Lecb_ret
@@ -896,12 +896,12 @@
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_enc1_5:
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
decl %edx
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_5
-.byte 102,15,56,221,209
+ aesenclast %xmm1,%xmm2
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
xorps %xmm3,%xmm2
@@ -944,17 +944,17 @@
bswapl %edx
xorl %ebp,%eax
xorl %ebp,%edx
-.byte 102,15,58,34,216,3
+ pinsrd $3,%eax,%xmm3
leaq 3(%r8),%rax
movdqa %xmm3,16(%rsp)
-.byte 102,15,58,34,226,3
+ pinsrd $3,%edx,%xmm4
bswapl %eax
movq %r10,%rdx
leaq 4(%r8),%r10
movdqa %xmm4,32(%rsp)
xorl %ebp,%eax
bswapl %r10d
-.byte 102,15,58,34,232,3
+ pinsrd $3,%eax,%xmm5
xorl %ebp,%r10d
movdqa %xmm5,48(%rsp)
leaq 5(%r8),%r9
@@ -988,163 +988,163 @@
.Lctr32_loop8:
addl $8,%r8d
movdqa 96(%rsp),%xmm8
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
movl %r8d,%r9d
movdqa 112(%rsp),%xmm9
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm3
bswapl %r9d
movups 32-128(%rcx),%xmm0
-.byte 102,15,56,220,225
+ aesenc %xmm1,%xmm4
xorl %ebp,%r9d
nop
-.byte 102,15,56,220,233
+ aesenc %xmm1,%xmm5
movl %r9d,0+12(%rsp)
leaq 1(%r8),%r9
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movups 48-128(%rcx),%xmm1
bswapl %r9d
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
xorl %ebp,%r9d
.byte 0x66,0x90
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
movl %r9d,16+12(%rsp)
leaq 2(%r8),%r9
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups 64-128(%rcx),%xmm0
bswapl %r9d
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
xorl %ebp,%r9d
.byte 0x66,0x90
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
movl %r9d,32+12(%rsp)
leaq 3(%r8),%r9
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movups 80-128(%rcx),%xmm1
bswapl %r9d
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
xorl %ebp,%r9d
.byte 0x66,0x90
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
movl %r9d,48+12(%rsp)
leaq 4(%r8),%r9
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups 96-128(%rcx),%xmm0
bswapl %r9d
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
xorl %ebp,%r9d
.byte 0x66,0x90
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
movl %r9d,64+12(%rsp)
leaq 5(%r8),%r9
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movups 112-128(%rcx),%xmm1
bswapl %r9d
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
xorl %ebp,%r9d
.byte 0x66,0x90
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
movl %r9d,80+12(%rsp)
leaq 6(%r8),%r9
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups 128-128(%rcx),%xmm0
bswapl %r9d
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
xorl %ebp,%r9d
.byte 0x66,0x90
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
movl %r9d,96+12(%rsp)
leaq 7(%r8),%r9
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movups 144-128(%rcx),%xmm1
bswapl %r9d
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
xorl %ebp,%r9d
movdqu 0(%rdi),%xmm10
-.byte 102,15,56,220,232
+ aesenc %xmm0,%xmm5
movl %r9d,112+12(%rsp)
cmpl $11,%eax
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups 160-128(%rcx),%xmm0
jb .Lctr32_enc_done
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movups 176-128(%rcx),%xmm1
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups 192-128(%rcx),%xmm0
je .Lctr32_enc_done
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movups 208-128(%rcx),%xmm1
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
+ aesenc %xmm0,%xmm2
+ aesenc %xmm0,%xmm3
+ aesenc %xmm0,%xmm4
+ aesenc %xmm0,%xmm5
+ aesenc %xmm0,%xmm6
+ aesenc %xmm0,%xmm7
+ aesenc %xmm0,%xmm8
+ aesenc %xmm0,%xmm9
movups 224-128(%rcx),%xmm0
jmp .Lctr32_enc_done
@@ -1163,35 +1163,35 @@
prefetcht0 448(%rdi)
prefetcht0 512(%rdi)
pxor %xmm0,%xmm15
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
+ aesenc %xmm1,%xmm9
movdqu 96(%rdi),%xmm1
leaq 128(%rdi),%rdi
-.byte 102,65,15,56,221,210
+ aesenclast %xmm10,%xmm2
pxor %xmm0,%xmm1
movdqu 112-128(%rdi),%xmm10
-.byte 102,65,15,56,221,219
+ aesenclast %xmm11,%xmm3
pxor %xmm0,%xmm10
movdqa 0(%rsp),%xmm11
-.byte 102,65,15,56,221,228
-.byte 102,65,15,56,221,237
+ aesenclast %xmm12,%xmm4
+ aesenclast %xmm13,%xmm5
movdqa 16(%rsp),%xmm12
movdqa 32(%rsp),%xmm13
-.byte 102,65,15,56,221,246
-.byte 102,65,15,56,221,255
+ aesenclast %xmm14,%xmm6
+ aesenclast %xmm15,%xmm7
movdqa 48(%rsp),%xmm14
movdqa 64(%rsp),%xmm15
-.byte 102,68,15,56,221,193
+ aesenclast %xmm1,%xmm8
movdqa 80(%rsp),%xmm0
movups 16-128(%rcx),%xmm1
-.byte 102,69,15,56,221,202
+ aesenclast %xmm10,%xmm9
movups %xmm2,(%rsi)
movdqa %xmm11,%xmm2
@@ -1230,19 +1230,19 @@
pxor %xmm9,%xmm9
movups 16(%rcx),%xmm0
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
+ aesenc %xmm1,%xmm2
+ aesenc %xmm1,%xmm3
leaq 32-16(%rcx,%rax,1),%rcx
negq %rax
-.byte 102,15,56,220,225
+ aesenc %xmm1,%xmm4
addq $16,%rax
movups (%rdi),%xmm10
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
+ aesenc %xmm1,%xmm5
+ aesenc %xmm1,%xmm6
movups 16(%rdi),%xmm11
movups 32(%rdi),%xmm12
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
+ aesenc %xmm1,%xmm7
+ aesenc %xmm1,%xmm8
call .Lenc_loop8_enter
@@ -1273,20 +1273,20 @@
.align 32
.Lctr32_loop4:
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
leaq 16(%rcx),%rcx
decl %eax
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
+ aesenc %xmm1,%xmm5
movups (%rcx),%xmm1
jnz .Lctr32_loop4
-.byte 102,15,56,221,209
-.byte 102,15,56,221,217
+ aesenclast %xmm1,%xmm2
+ aesenclast %xmm1,%xmm3
movups (%rdi),%xmm10
movups 16(%rdi),%xmm11
-.byte 102,15,56,221,225
-.byte 102,15,56,221,233
+ aesenclast %xmm1,%xmm4
+ aesenclast %xmm1,%xmm5
movups 32(%rdi),%xmm12
movups 48(%rdi),%xmm13
@@ -1302,16 +1302,16 @@
.align 32
.Lctr32_loop3:
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
leaq 16(%rcx),%rcx
decl %eax
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
+ aesenc %xmm1,%xmm3
+ aesenc %xmm1,%xmm4
movups (%rcx),%xmm1
jnz .Lctr32_loop3
-.byte 102,15,56,221,209
-.byte 102,15,56,221,217
-.byte 102,15,56,221,225
+ aesenclast %xmm1,%xmm2
+ aesenclast %xmm1,%xmm3
+ aesenclast %xmm1,%xmm4
movups (%rdi),%xmm10
xorps %xmm10,%xmm2
@@ -1394,12 +1394,12 @@
leaq 32(%rcx),%rcx
xorps %xmm3,%xmm2
.Loop_enc1_6:
-.byte 102,15,56,220,209
+ aesenc %xmm1,%xmm2
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_6
-.byte 102,15,56,221,209
+ aesenclast %xmm1,%xmm2
movl %r10d,%eax
movq %r11,%rcx
movups %xmm2,0(%rsi)
@@ -1445,12 +1445,12 @@
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_dec1_7:
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
decl %r10d
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_dec1_7
-.byte 102,15,56,223,209
+ aesdeclast %xmm1,%xmm2
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movdqu %xmm4,(%r8)
@@ -1509,166 +1509,166 @@
pxor %xmm0,%xmm7
pxor %xmm0,%xmm8
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
pxor %xmm0,%xmm9
movups 32-112(%rcx),%xmm0
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
adcq $0,%rbp
andq $128,%rbp
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm9
addq %rdi,%rbp
movups 48-112(%rcx),%xmm1
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups 64-112(%rcx),%xmm0
nop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
movups 80-112(%rcx),%xmm1
nop
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups 96-112(%rcx),%xmm0
nop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
movups 112-112(%rcx),%xmm1
nop
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups 128-112(%rcx),%xmm0
nop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
movups 144-112(%rcx),%xmm1
cmpl $11,%eax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups 160-112(%rcx),%xmm0
jb .Lcbc_dec_done
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
movups 176-112(%rcx),%xmm1
nop
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups 192-112(%rcx),%xmm0
je .Lcbc_dec_done
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
movups 208-112(%rcx),%xmm1
nop
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
+ aesdec %xmm0,%xmm2
+ aesdec %xmm0,%xmm3
+ aesdec %xmm0,%xmm4
+ aesdec %xmm0,%xmm5
+ aesdec %xmm0,%xmm6
+ aesdec %xmm0,%xmm7
+ aesdec %xmm0,%xmm8
+ aesdec %xmm0,%xmm9
movups 224-112(%rcx),%xmm0
jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_done:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
+ aesdec %xmm1,%xmm2
+ aesdec %xmm1,%xmm3
pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
+ aesdec %xmm1,%xmm4
+ aesdec %xmm1,%xmm5
pxor %xmm0,%xmm12
pxor %xmm0,%xmm13
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
+ aesdec %xmm1,%xmm6
+ aesdec %xmm1,%xmm7
pxor %xmm0,%xmm14
pxor %xmm0,%xmm15
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
+ aesdec %xmm1,%xmm8
+ aesdec %xmm1,%xmm9
movdqu 80(%rdi),%xmm1
-.byte 102,65,15,56,223,210
+ aesdeclast %xmm10,%xmm2
movdqu 96(%rdi),%xmm10
pxor %xmm0,%xmm1
-.byte 102,65,15,56,223,219
+ aesdeclast %xmm11,%xmm3
pxor %xmm0,%xmm10
movdqu 112(%rdi),%xmm0
-.byte 102,65,15,56,223,228
+ aesdeclast %xmm12,%xmm4
leaq 128(%rdi),%rdi
movdqu 0(%rbp),%xmm11
-.byte 102,65,15,56,223,237
-.byte 102,65,15,56,223,246
+ aesdeclast %xmm13,%xmm5
+ aesdeclast %xmm14,%xmm6
movdqu 16(%rbp),%xmm12
movdqu 32(%rbp),%xmm13
-.byte 102,65,15,56,223,255
-.byte 102,68,15,56,223,193
+ aesdeclast %xmm15,%xmm7
+ aesdeclast %xmm1,%xmm8
movdqu 48(%rbp),%xmm14
movdqu 64(%rbp),%xmm15
-.byte 102,69,15,56,223,202
+ aesdeclast %xmm10,%xmm9
movdqa %xmm0,%xmm10
movdqu 80(%rbp),%xmm1
movups -112(%rcx),%xmm0
@@ -1812,12 +1812,12 @@
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_dec1_8:
-.byte 102,15,56,222,209
+ aesdec %xmm1,%xmm2
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_dec1_8
-.byte 102,15,56,223,209
+ aesdeclast %xmm1,%xmm2
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
jmp .Lcbc_dec_tail_collected
@@ -1928,8 +1928,8 @@
.Ldec_key_inverse:
movups (%rdi),%xmm0
movups (%rdx),%xmm1
-.byte 102,15,56,219,192
-.byte 102,15,56,219,201
+ aesimc %xmm0,%xmm0
+ aesimc %xmm1,%xmm1
leaq 16(%rdi),%rdi
leaq -16(%rdx),%rdx
movups %xmm0,16(%rdx)
@@ -1938,7 +1938,7 @@
ja .Ldec_key_inverse
movups (%rdi),%xmm0
-.byte 102,15,56,219,192
+ aesimc %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm0,(%rdx)
pxor %xmm0,%xmm0
@@ -1974,25 +1974,25 @@
movl $9,%esi
movups %xmm0,(%rdx)
-.byte 102,15,58,223,200,1
+ aeskeygenassist $0x1,%xmm0,%xmm1
call .Lkey_expansion_128_cold
-.byte 102,15,58,223,200,2
+ aeskeygenassist $0x2,%xmm0,%xmm1
call .Lkey_expansion_128
-.byte 102,15,58,223,200,4
+ aeskeygenassist $0x4,%xmm0,%xmm1
call .Lkey_expansion_128
-.byte 102,15,58,223,200,8
+ aeskeygenassist $0x8,%xmm0,%xmm1
call .Lkey_expansion_128
-.byte 102,15,58,223,200,16
+ aeskeygenassist $0x10,%xmm0,%xmm1
call .Lkey_expansion_128
-.byte 102,15,58,223,200,32
+ aeskeygenassist $0x20,%xmm0,%xmm1
call .Lkey_expansion_128
-.byte 102,15,58,223,200,64
+ aeskeygenassist $0x40,%xmm0,%xmm1
call .Lkey_expansion_128
-.byte 102,15,58,223,200,128
+ aeskeygenassist $0x80,%xmm0,%xmm1
call .Lkey_expansion_128
-.byte 102,15,58,223,200,27
+ aeskeygenassist $0x1b,%xmm0,%xmm1
call .Lkey_expansion_128
-.byte 102,15,58,223,200,54
+ aeskeygenassist $0x36,%xmm0,%xmm1
call .Lkey_expansion_128
movups %xmm0,(%rax)
movl %esi,80(%rax)
@@ -2005,21 +2005,21 @@
movl $11,%esi
movups %xmm0,(%rdx)
-.byte 102,15,58,223,202,1
+ aeskeygenassist $0x1,%xmm2,%xmm1
call .Lkey_expansion_192a_cold
-.byte 102,15,58,223,202,2
+ aeskeygenassist $0x2,%xmm2,%xmm1
call .Lkey_expansion_192b
-.byte 102,15,58,223,202,4
+ aeskeygenassist $0x4,%xmm2,%xmm1
call .Lkey_expansion_192a
-.byte 102,15,58,223,202,8
+ aeskeygenassist $0x8,%xmm2,%xmm1
call .Lkey_expansion_192b
-.byte 102,15,58,223,202,16
+ aeskeygenassist $0x10,%xmm2,%xmm1
call .Lkey_expansion_192a
-.byte 102,15,58,223,202,32
+ aeskeygenassist $0x20,%xmm2,%xmm1
call .Lkey_expansion_192b
-.byte 102,15,58,223,202,64
+ aeskeygenassist $0x40,%xmm2,%xmm1
call .Lkey_expansion_192a
-.byte 102,15,58,223,202,128
+ aeskeygenassist $0x80,%xmm2,%xmm1
call .Lkey_expansion_192b
movups %xmm0,(%rax)
movl %esi,48(%rax)
@@ -2034,31 +2034,31 @@
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
-.byte 102,15,58,223,202,1
+ aeskeygenassist $0x1,%xmm2,%xmm1
call .Lkey_expansion_256a_cold
-.byte 102,15,58,223,200,1
+ aeskeygenassist $0x1,%xmm0,%xmm1
call .Lkey_expansion_256b
-.byte 102,15,58,223,202,2
+ aeskeygenassist $0x2,%xmm2,%xmm1
call .Lkey_expansion_256a
-.byte 102,15,58,223,200,2
+ aeskeygenassist $0x2,%xmm0,%xmm1
call .Lkey_expansion_256b
-.byte 102,15,58,223,202,4
+ aeskeygenassist $0x4,%xmm2,%xmm1
call .Lkey_expansion_256a
-.byte 102,15,58,223,200,4
+ aeskeygenassist $0x4,%xmm0,%xmm1
call .Lkey_expansion_256b
-.byte 102,15,58,223,202,8
+ aeskeygenassist $0x8,%xmm2,%xmm1
call .Lkey_expansion_256a
-.byte 102,15,58,223,200,8
+ aeskeygenassist $0x8,%xmm0,%xmm1
call .Lkey_expansion_256b
-.byte 102,15,58,223,202,16
+ aeskeygenassist $0x10,%xmm2,%xmm1
call .Lkey_expansion_256a
-.byte 102,15,58,223,200,16
+ aeskeygenassist $0x10,%xmm0,%xmm1
call .Lkey_expansion_256b
-.byte 102,15,58,223,202,32
+ aeskeygenassist $0x20,%xmm2,%xmm1
call .Lkey_expansion_256a
-.byte 102,15,58,223,200,32
+ aeskeygenassist $0x20,%xmm0,%xmm1
call .Lkey_expansion_256b
-.byte 102,15,58,223,202,64
+ aeskeygenassist $0x40,%xmm2,%xmm1
call .Lkey_expansion_256a
movups %xmm0,(%rax)
movl %esi,16(%rax)
@@ -2196,8 +2196,8 @@
.align 16
.Loop_key128:
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
pslld $1,%xmm4
leaq 16(%rax),%rax
@@ -2218,8 +2218,8 @@
movdqa .Lkey_rcon1b(%rip),%xmm4
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
pslld $1,%xmm4
movdqa %xmm2,%xmm3
@@ -2234,8 +2234,8 @@
movdqu %xmm0,(%rax)
movdqa %xmm0,%xmm2
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
@@ -2266,8 +2266,8 @@
.Loop_key192:
movq %xmm2,0(%rax)
movdqa %xmm2,%xmm1
-.byte 102,15,56,0,213
-.byte 102,15,56,221,212
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
pslld $1,%xmm4
leaq 24(%rax),%rax
@@ -2310,8 +2310,8 @@
.align 16
.Loop_key256:
-.byte 102,15,56,0,213
-.byte 102,15,56,221,212
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
movdqa %xmm0,%xmm3
pslldq $4,%xmm0
@@ -2330,7 +2330,7 @@
pshufd $0xff,%xmm0,%xmm2
pxor %xmm3,%xmm3
-.byte 102,15,56,221,211
+ aesenclast %xmm3,%xmm2
movdqa %xmm1,%xmm3
pslldq $4,%xmm1
diff --git a/gen/bcm/aesni-x86_64-win.asm b/gen/bcm/aesni-x86_64-win.asm
index c585507..8e592cd 100644
--- a/gen/bcm/aesni-x86_64-win.asm
+++ b/gen/bcm/aesni-x86_64-win.asm
@@ -30,12 +30,12 @@
lea r8,[32+r8]
xorps xmm2,xmm0
$L$oop_enc1_1:
- DB 102,15,56,220,209
+ aesenc xmm2,xmm1
dec eax
movups xmm1,XMMWORD[r8]
lea r8,[16+r8]
jnz NEAR $L$oop_enc1_1
- DB 102,15,56,221,209
+ aesenclast xmm2,xmm1
pxor xmm0,xmm0
pxor xmm1,xmm1
movups XMMWORD[rdx],xmm2
@@ -57,12 +57,12 @@
lea r8,[32+r8]
xorps xmm2,xmm0
$L$oop_dec1_2:
- DB 102,15,56,222,209
+ aesdec xmm2,xmm1
dec eax
movups xmm1,XMMWORD[r8]
lea r8,[16+r8]
jnz NEAR $L$oop_dec1_2
- DB 102,15,56,223,209
+ aesdeclast xmm2,xmm1
pxor xmm0,xmm0
pxor xmm1,xmm1
movups XMMWORD[rdx],xmm2
@@ -85,19 +85,19 @@
add rax,16
$L$enc_loop2:
- DB 102,15,56,220,209
- DB 102,15,56,220,217
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
movups xmm1,XMMWORD[rax*1+rcx]
add rax,32
- DB 102,15,56,220,208
- DB 102,15,56,220,216
+ aesenc xmm2,xmm0
+ aesenc xmm3,xmm0
movups xmm0,XMMWORD[((-16))+rax*1+rcx]
jnz NEAR $L$enc_loop2
- DB 102,15,56,220,209
- DB 102,15,56,220,217
- DB 102,15,56,221,208
- DB 102,15,56,221,216
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
+ aesenclast xmm2,xmm0
+ aesenclast xmm3,xmm0
ret
@@ -116,19 +116,19 @@
add rax,16
$L$dec_loop2:
- DB 102,15,56,222,209
- DB 102,15,56,222,217
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
movups xmm1,XMMWORD[rax*1+rcx]
add rax,32
- DB 102,15,56,222,208
- DB 102,15,56,222,216
+ aesdec xmm2,xmm0
+ aesdec xmm3,xmm0
movups xmm0,XMMWORD[((-16))+rax*1+rcx]
jnz NEAR $L$dec_loop2
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,223,208
- DB 102,15,56,223,216
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdeclast xmm2,xmm0
+ aesdeclast xmm3,xmm0
ret
@@ -148,23 +148,23 @@
add rax,16
$L$enc_loop3:
- DB 102,15,56,220,209
- DB 102,15,56,220,217
- DB 102,15,56,220,225
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
+ aesenc xmm4,xmm1
movups xmm1,XMMWORD[rax*1+rcx]
add rax,32
- DB 102,15,56,220,208
- DB 102,15,56,220,216
- DB 102,15,56,220,224
+ aesenc xmm2,xmm0
+ aesenc xmm3,xmm0
+ aesenc xmm4,xmm0
movups xmm0,XMMWORD[((-16))+rax*1+rcx]
jnz NEAR $L$enc_loop3
- DB 102,15,56,220,209
- DB 102,15,56,220,217
- DB 102,15,56,220,225
- DB 102,15,56,221,208
- DB 102,15,56,221,216
- DB 102,15,56,221,224
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
+ aesenc xmm4,xmm1
+ aesenclast xmm2,xmm0
+ aesenclast xmm3,xmm0
+ aesenclast xmm4,xmm0
ret
@@ -184,23 +184,23 @@
add rax,16
$L$dec_loop3:
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,222,225
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
movups xmm1,XMMWORD[rax*1+rcx]
add rax,32
- DB 102,15,56,222,208
- DB 102,15,56,222,216
- DB 102,15,56,222,224
+ aesdec xmm2,xmm0
+ aesdec xmm3,xmm0
+ aesdec xmm4,xmm0
movups xmm0,XMMWORD[((-16))+rax*1+rcx]
jnz NEAR $L$dec_loop3
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,222,225
- DB 102,15,56,223,208
- DB 102,15,56,223,216
- DB 102,15,56,223,224
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
+ aesdeclast xmm2,xmm0
+ aesdeclast xmm3,xmm0
+ aesdeclast xmm4,xmm0
ret
@@ -222,27 +222,27 @@
add rax,16
$L$enc_loop4:
- DB 102,15,56,220,209
- DB 102,15,56,220,217
- DB 102,15,56,220,225
- DB 102,15,56,220,233
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
+ aesenc xmm4,xmm1
+ aesenc xmm5,xmm1
movups xmm1,XMMWORD[rax*1+rcx]
add rax,32
- DB 102,15,56,220,208
- DB 102,15,56,220,216
- DB 102,15,56,220,224
- DB 102,15,56,220,232
+ aesenc xmm2,xmm0
+ aesenc xmm3,xmm0
+ aesenc xmm4,xmm0
+ aesenc xmm5,xmm0
movups xmm0,XMMWORD[((-16))+rax*1+rcx]
jnz NEAR $L$enc_loop4
- DB 102,15,56,220,209
- DB 102,15,56,220,217
- DB 102,15,56,220,225
- DB 102,15,56,220,233
- DB 102,15,56,221,208
- DB 102,15,56,221,216
- DB 102,15,56,221,224
- DB 102,15,56,221,232
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
+ aesenc xmm4,xmm1
+ aesenc xmm5,xmm1
+ aesenclast xmm2,xmm0
+ aesenclast xmm3,xmm0
+ aesenclast xmm4,xmm0
+ aesenclast xmm5,xmm0
ret
@@ -264,27 +264,27 @@
add rax,16
$L$dec_loop4:
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,222,225
- DB 102,15,56,222,233
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
+ aesdec xmm5,xmm1
movups xmm1,XMMWORD[rax*1+rcx]
add rax,32
- DB 102,15,56,222,208
- DB 102,15,56,222,216
- DB 102,15,56,222,224
- DB 102,15,56,222,232
+ aesdec xmm2,xmm0
+ aesdec xmm3,xmm0
+ aesdec xmm4,xmm0
+ aesdec xmm5,xmm0
movups xmm0,XMMWORD[((-16))+rax*1+rcx]
jnz NEAR $L$dec_loop4
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,222,225
- DB 102,15,56,222,233
- DB 102,15,56,223,208
- DB 102,15,56,223,216
- DB 102,15,56,223,224
- DB 102,15,56,223,232
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
+ aesdec xmm5,xmm1
+ aesdeclast xmm2,xmm0
+ aesdeclast xmm3,xmm0
+ aesdeclast xmm4,xmm0
+ aesdeclast xmm5,xmm0
ret
@@ -298,49 +298,49 @@
xorps xmm2,xmm0
pxor xmm3,xmm0
pxor xmm4,xmm0
- DB 102,15,56,220,209
+ aesenc xmm2,xmm1
lea rcx,[32+rax*1+rcx]
neg rax
- DB 102,15,56,220,217
+ aesenc xmm3,xmm1
pxor xmm5,xmm0
pxor xmm6,xmm0
- DB 102,15,56,220,225
+ aesenc xmm4,xmm1
pxor xmm7,xmm0
movups xmm0,XMMWORD[rax*1+rcx]
add rax,16
jmp NEAR $L$enc_loop6_enter
ALIGN 16
$L$enc_loop6:
- DB 102,15,56,220,209
- DB 102,15,56,220,217
- DB 102,15,56,220,225
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
+ aesenc xmm4,xmm1
$L$enc_loop6_enter:
- DB 102,15,56,220,233
- DB 102,15,56,220,241
- DB 102,15,56,220,249
+ aesenc xmm5,xmm1
+ aesenc xmm6,xmm1
+ aesenc xmm7,xmm1
movups xmm1,XMMWORD[rax*1+rcx]
add rax,32
- DB 102,15,56,220,208
- DB 102,15,56,220,216
- DB 102,15,56,220,224
- DB 102,15,56,220,232
- DB 102,15,56,220,240
- DB 102,15,56,220,248
+ aesenc xmm2,xmm0
+ aesenc xmm3,xmm0
+ aesenc xmm4,xmm0
+ aesenc xmm5,xmm0
+ aesenc xmm6,xmm0
+ aesenc xmm7,xmm0
movups xmm0,XMMWORD[((-16))+rax*1+rcx]
jnz NEAR $L$enc_loop6
- DB 102,15,56,220,209
- DB 102,15,56,220,217
- DB 102,15,56,220,225
- DB 102,15,56,220,233
- DB 102,15,56,220,241
- DB 102,15,56,220,249
- DB 102,15,56,221,208
- DB 102,15,56,221,216
- DB 102,15,56,221,224
- DB 102,15,56,221,232
- DB 102,15,56,221,240
- DB 102,15,56,221,248
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
+ aesenc xmm4,xmm1
+ aesenc xmm5,xmm1
+ aesenc xmm6,xmm1
+ aesenc xmm7,xmm1
+ aesenclast xmm2,xmm0
+ aesenclast xmm3,xmm0
+ aesenclast xmm4,xmm0
+ aesenclast xmm5,xmm0
+ aesenclast xmm6,xmm0
+ aesenclast xmm7,xmm0
ret
@@ -354,49 +354,49 @@
xorps xmm2,xmm0
pxor xmm3,xmm0
pxor xmm4,xmm0
- DB 102,15,56,222,209
+ aesdec xmm2,xmm1
lea rcx,[32+rax*1+rcx]
neg rax
- DB 102,15,56,222,217
+ aesdec xmm3,xmm1
pxor xmm5,xmm0
pxor xmm6,xmm0
- DB 102,15,56,222,225
+ aesdec xmm4,xmm1
pxor xmm7,xmm0
movups xmm0,XMMWORD[rax*1+rcx]
add rax,16
jmp NEAR $L$dec_loop6_enter
ALIGN 16
$L$dec_loop6:
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,222,225
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
$L$dec_loop6_enter:
- DB 102,15,56,222,233
- DB 102,15,56,222,241
- DB 102,15,56,222,249
+ aesdec xmm5,xmm1
+ aesdec xmm6,xmm1
+ aesdec xmm7,xmm1
movups xmm1,XMMWORD[rax*1+rcx]
add rax,32
- DB 102,15,56,222,208
- DB 102,15,56,222,216
- DB 102,15,56,222,224
- DB 102,15,56,222,232
- DB 102,15,56,222,240
- DB 102,15,56,222,248
+ aesdec xmm2,xmm0
+ aesdec xmm3,xmm0
+ aesdec xmm4,xmm0
+ aesdec xmm5,xmm0
+ aesdec xmm6,xmm0
+ aesdec xmm7,xmm0
movups xmm0,XMMWORD[((-16))+rax*1+rcx]
jnz NEAR $L$dec_loop6
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,222,225
- DB 102,15,56,222,233
- DB 102,15,56,222,241
- DB 102,15,56,222,249
- DB 102,15,56,223,208
- DB 102,15,56,223,216
- DB 102,15,56,223,224
- DB 102,15,56,223,232
- DB 102,15,56,223,240
- DB 102,15,56,223,248
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
+ aesdec xmm5,xmm1
+ aesdec xmm6,xmm1
+ aesdec xmm7,xmm1
+ aesdeclast xmm2,xmm0
+ aesdeclast xmm3,xmm0
+ aesdeclast xmm4,xmm0
+ aesdeclast xmm5,xmm0
+ aesdeclast xmm6,xmm0
+ aesdeclast xmm7,xmm0
ret
@@ -414,55 +414,55 @@
pxor xmm6,xmm0
lea rcx,[32+rax*1+rcx]
neg rax
- DB 102,15,56,220,209
+ aesenc xmm2,xmm1
pxor xmm7,xmm0
pxor xmm8,xmm0
- DB 102,15,56,220,217
+ aesenc xmm3,xmm1
pxor xmm9,xmm0
movups xmm0,XMMWORD[rax*1+rcx]
add rax,16
jmp NEAR $L$enc_loop8_inner
ALIGN 16
$L$enc_loop8:
- DB 102,15,56,220,209
- DB 102,15,56,220,217
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
$L$enc_loop8_inner:
- DB 102,15,56,220,225
- DB 102,15,56,220,233
- DB 102,15,56,220,241
- DB 102,15,56,220,249
- DB 102,68,15,56,220,193
- DB 102,68,15,56,220,201
+ aesenc xmm4,xmm1
+ aesenc xmm5,xmm1
+ aesenc xmm6,xmm1
+ aesenc xmm7,xmm1
+ aesenc xmm8,xmm1
+ aesenc xmm9,xmm1
$L$enc_loop8_enter:
movups xmm1,XMMWORD[rax*1+rcx]
add rax,32
- DB 102,15,56,220,208
- DB 102,15,56,220,216
- DB 102,15,56,220,224
- DB 102,15,56,220,232
- DB 102,15,56,220,240
- DB 102,15,56,220,248
- DB 102,68,15,56,220,192
- DB 102,68,15,56,220,200
+ aesenc xmm2,xmm0
+ aesenc xmm3,xmm0
+ aesenc xmm4,xmm0
+ aesenc xmm5,xmm0
+ aesenc xmm6,xmm0
+ aesenc xmm7,xmm0
+ aesenc xmm8,xmm0
+ aesenc xmm9,xmm0
movups xmm0,XMMWORD[((-16))+rax*1+rcx]
jnz NEAR $L$enc_loop8
- DB 102,15,56,220,209
- DB 102,15,56,220,217
- DB 102,15,56,220,225
- DB 102,15,56,220,233
- DB 102,15,56,220,241
- DB 102,15,56,220,249
- DB 102,68,15,56,220,193
- DB 102,68,15,56,220,201
- DB 102,15,56,221,208
- DB 102,15,56,221,216
- DB 102,15,56,221,224
- DB 102,15,56,221,232
- DB 102,15,56,221,240
- DB 102,15,56,221,248
- DB 102,68,15,56,221,192
- DB 102,68,15,56,221,200
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
+ aesenc xmm4,xmm1
+ aesenc xmm5,xmm1
+ aesenc xmm6,xmm1
+ aesenc xmm7,xmm1
+ aesenc xmm8,xmm1
+ aesenc xmm9,xmm1
+ aesenclast xmm2,xmm0
+ aesenclast xmm3,xmm0
+ aesenclast xmm4,xmm0
+ aesenclast xmm5,xmm0
+ aesenclast xmm6,xmm0
+ aesenclast xmm7,xmm0
+ aesenclast xmm8,xmm0
+ aesenclast xmm9,xmm0
ret
@@ -480,55 +480,55 @@
pxor xmm6,xmm0
lea rcx,[32+rax*1+rcx]
neg rax
- DB 102,15,56,222,209
+ aesdec xmm2,xmm1
pxor xmm7,xmm0
pxor xmm8,xmm0
- DB 102,15,56,222,217
+ aesdec xmm3,xmm1
pxor xmm9,xmm0
movups xmm0,XMMWORD[rax*1+rcx]
add rax,16
jmp NEAR $L$dec_loop8_inner
ALIGN 16
$L$dec_loop8:
- DB 102,15,56,222,209
- DB 102,15,56,222,217
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
$L$dec_loop8_inner:
- DB 102,15,56,222,225
- DB 102,15,56,222,233
- DB 102,15,56,222,241
- DB 102,15,56,222,249
- DB 102,68,15,56,222,193
- DB 102,68,15,56,222,201
+ aesdec xmm4,xmm1
+ aesdec xmm5,xmm1
+ aesdec xmm6,xmm1
+ aesdec xmm7,xmm1
+ aesdec xmm8,xmm1
+ aesdec xmm9,xmm1
$L$dec_loop8_enter:
movups xmm1,XMMWORD[rax*1+rcx]
add rax,32
- DB 102,15,56,222,208
- DB 102,15,56,222,216
- DB 102,15,56,222,224
- DB 102,15,56,222,232
- DB 102,15,56,222,240
- DB 102,15,56,222,248
- DB 102,68,15,56,222,192
- DB 102,68,15,56,222,200
+ aesdec xmm2,xmm0
+ aesdec xmm3,xmm0
+ aesdec xmm4,xmm0
+ aesdec xmm5,xmm0
+ aesdec xmm6,xmm0
+ aesdec xmm7,xmm0
+ aesdec xmm8,xmm0
+ aesdec xmm9,xmm0
movups xmm0,XMMWORD[((-16))+rax*1+rcx]
jnz NEAR $L$dec_loop8
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,222,225
- DB 102,15,56,222,233
- DB 102,15,56,222,241
- DB 102,15,56,222,249
- DB 102,68,15,56,222,193
- DB 102,68,15,56,222,201
- DB 102,15,56,223,208
- DB 102,15,56,223,216
- DB 102,15,56,223,224
- DB 102,15,56,223,232
- DB 102,15,56,223,240
- DB 102,15,56,223,248
- DB 102,68,15,56,223,192
- DB 102,68,15,56,223,200
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
+ aesdec xmm5,xmm1
+ aesdec xmm6,xmm1
+ aesdec xmm7,xmm1
+ aesdec xmm8,xmm1
+ aesdec xmm9,xmm1
+ aesdeclast xmm2,xmm0
+ aesdeclast xmm3,xmm0
+ aesdeclast xmm4,xmm0
+ aesdeclast xmm5,xmm0
+ aesdeclast xmm6,xmm0
+ aesdeclast xmm7,xmm0
+ aesdeclast xmm8,xmm0
+ aesdeclast xmm9,xmm0
ret
@@ -656,12 +656,12 @@
lea rcx,[32+rcx]
xorps xmm2,xmm0
$L$oop_enc1_3:
- DB 102,15,56,220,209
+ aesenc xmm2,xmm1
dec eax
movups xmm1,XMMWORD[rcx]
lea rcx,[16+rcx]
jnz NEAR $L$oop_enc1_3
- DB 102,15,56,221,209
+ aesenclast xmm2,xmm1
movups XMMWORD[rsi],xmm2
jmp NEAR $L$ecb_ret
ALIGN 16
@@ -817,12 +817,12 @@
lea rcx,[32+rcx]
xorps xmm2,xmm0
$L$oop_dec1_4:
- DB 102,15,56,222,209
+ aesdec xmm2,xmm1
dec eax
movups xmm1,XMMWORD[rcx]
lea rcx,[16+rcx]
jnz NEAR $L$oop_dec1_4
- DB 102,15,56,223,209
+ aesdeclast xmm2,xmm1
movups XMMWORD[rsi],xmm2
pxor xmm2,xmm2
jmp NEAR $L$ecb_ret
@@ -939,12 +939,12 @@
lea rcx,[32+rcx]
xorps xmm2,xmm0
$L$oop_enc1_5:
- DB 102,15,56,220,209
+ aesenc xmm2,xmm1
dec edx
movups xmm1,XMMWORD[rcx]
lea rcx,[16+rcx]
jnz NEAR $L$oop_enc1_5
- DB 102,15,56,221,209
+ aesenclast xmm2,xmm1
pxor xmm0,xmm0
pxor xmm1,xmm1
xorps xmm2,xmm3
@@ -998,17 +998,17 @@
bswap edx
xor eax,ebp
xor edx,ebp
-DB 102,15,58,34,216,3
+ pinsrd xmm3,eax,3
lea rax,[3+r8]
movdqa XMMWORD[16+rsp],xmm3
-DB 102,15,58,34,226,3
+ pinsrd xmm4,edx,3
bswap eax
mov rdx,r10
lea r10,[4+r8]
movdqa XMMWORD[32+rsp],xmm4
xor eax,ebp
bswap r10d
-DB 102,15,58,34,232,3
+ pinsrd xmm5,eax,3
xor r10d,ebp
movdqa XMMWORD[48+rsp],xmm5
lea r9,[5+r8]
@@ -1042,163 +1042,163 @@
$L$ctr32_loop8:
add r8d,8
movdqa xmm8,XMMWORD[96+rsp]
- DB 102,15,56,220,209
+ aesenc xmm2,xmm1
mov r9d,r8d
movdqa xmm9,XMMWORD[112+rsp]
- DB 102,15,56,220,217
+ aesenc xmm3,xmm1
bswap r9d
movups xmm0,XMMWORD[((32-128))+rcx]
- DB 102,15,56,220,225
+ aesenc xmm4,xmm1
xor r9d,ebp
nop
- DB 102,15,56,220,233
+ aesenc xmm5,xmm1
mov DWORD[((0+12))+rsp],r9d
lea r9,[1+r8]
- DB 102,15,56,220,241
- DB 102,15,56,220,249
- DB 102,68,15,56,220,193
- DB 102,68,15,56,220,201
+ aesenc xmm6,xmm1
+ aesenc xmm7,xmm1
+ aesenc xmm8,xmm1
+ aesenc xmm9,xmm1
movups xmm1,XMMWORD[((48-128))+rcx]
bswap r9d
- DB 102,15,56,220,208
- DB 102,15,56,220,216
+ aesenc xmm2,xmm0
+ aesenc xmm3,xmm0
xor r9d,ebp
DB 0x66,0x90
- DB 102,15,56,220,224
- DB 102,15,56,220,232
+ aesenc xmm4,xmm0
+ aesenc xmm5,xmm0
mov DWORD[((16+12))+rsp],r9d
lea r9,[2+r8]
- DB 102,15,56,220,240
- DB 102,15,56,220,248
- DB 102,68,15,56,220,192
- DB 102,68,15,56,220,200
+ aesenc xmm6,xmm0
+ aesenc xmm7,xmm0
+ aesenc xmm8,xmm0
+ aesenc xmm9,xmm0
movups xmm0,XMMWORD[((64-128))+rcx]
bswap r9d
- DB 102,15,56,220,209
- DB 102,15,56,220,217
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
xor r9d,ebp
DB 0x66,0x90
- DB 102,15,56,220,225
- DB 102,15,56,220,233
+ aesenc xmm4,xmm1
+ aesenc xmm5,xmm1
mov DWORD[((32+12))+rsp],r9d
lea r9,[3+r8]
- DB 102,15,56,220,241
- DB 102,15,56,220,249
- DB 102,68,15,56,220,193
- DB 102,68,15,56,220,201
+ aesenc xmm6,xmm1
+ aesenc xmm7,xmm1
+ aesenc xmm8,xmm1
+ aesenc xmm9,xmm1
movups xmm1,XMMWORD[((80-128))+rcx]
bswap r9d
- DB 102,15,56,220,208
- DB 102,15,56,220,216
+ aesenc xmm2,xmm0
+ aesenc xmm3,xmm0
xor r9d,ebp
DB 0x66,0x90
- DB 102,15,56,220,224
- DB 102,15,56,220,232
+ aesenc xmm4,xmm0
+ aesenc xmm5,xmm0
mov DWORD[((48+12))+rsp],r9d
lea r9,[4+r8]
- DB 102,15,56,220,240
- DB 102,15,56,220,248
- DB 102,68,15,56,220,192
- DB 102,68,15,56,220,200
+ aesenc xmm6,xmm0
+ aesenc xmm7,xmm0
+ aesenc xmm8,xmm0
+ aesenc xmm9,xmm0
movups xmm0,XMMWORD[((96-128))+rcx]
bswap r9d
- DB 102,15,56,220,209
- DB 102,15,56,220,217
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
xor r9d,ebp
DB 0x66,0x90
- DB 102,15,56,220,225
- DB 102,15,56,220,233
+ aesenc xmm4,xmm1
+ aesenc xmm5,xmm1
mov DWORD[((64+12))+rsp],r9d
lea r9,[5+r8]
- DB 102,15,56,220,241
- DB 102,15,56,220,249
- DB 102,68,15,56,220,193
- DB 102,68,15,56,220,201
+ aesenc xmm6,xmm1
+ aesenc xmm7,xmm1
+ aesenc xmm8,xmm1
+ aesenc xmm9,xmm1
movups xmm1,XMMWORD[((112-128))+rcx]
bswap r9d
- DB 102,15,56,220,208
- DB 102,15,56,220,216
+ aesenc xmm2,xmm0
+ aesenc xmm3,xmm0
xor r9d,ebp
DB 0x66,0x90
- DB 102,15,56,220,224
- DB 102,15,56,220,232
+ aesenc xmm4,xmm0
+ aesenc xmm5,xmm0
mov DWORD[((80+12))+rsp],r9d
lea r9,[6+r8]
- DB 102,15,56,220,240
- DB 102,15,56,220,248
- DB 102,68,15,56,220,192
- DB 102,68,15,56,220,200
+ aesenc xmm6,xmm0
+ aesenc xmm7,xmm0
+ aesenc xmm8,xmm0
+ aesenc xmm9,xmm0
movups xmm0,XMMWORD[((128-128))+rcx]
bswap r9d
- DB 102,15,56,220,209
- DB 102,15,56,220,217
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
xor r9d,ebp
DB 0x66,0x90
- DB 102,15,56,220,225
- DB 102,15,56,220,233
+ aesenc xmm4,xmm1
+ aesenc xmm5,xmm1
mov DWORD[((96+12))+rsp],r9d
lea r9,[7+r8]
- DB 102,15,56,220,241
- DB 102,15,56,220,249
- DB 102,68,15,56,220,193
- DB 102,68,15,56,220,201
+ aesenc xmm6,xmm1
+ aesenc xmm7,xmm1
+ aesenc xmm8,xmm1
+ aesenc xmm9,xmm1
movups xmm1,XMMWORD[((144-128))+rcx]
bswap r9d
- DB 102,15,56,220,208
- DB 102,15,56,220,216
- DB 102,15,56,220,224
+ aesenc xmm2,xmm0
+ aesenc xmm3,xmm0
+ aesenc xmm4,xmm0
xor r9d,ebp
movdqu xmm10,XMMWORD[rdi]
- DB 102,15,56,220,232
+ aesenc xmm5,xmm0
mov DWORD[((112+12))+rsp],r9d
cmp eax,11
- DB 102,15,56,220,240
- DB 102,15,56,220,248
- DB 102,68,15,56,220,192
- DB 102,68,15,56,220,200
+ aesenc xmm6,xmm0
+ aesenc xmm7,xmm0
+ aesenc xmm8,xmm0
+ aesenc xmm9,xmm0
movups xmm0,XMMWORD[((160-128))+rcx]
jb NEAR $L$ctr32_enc_done
- DB 102,15,56,220,209
- DB 102,15,56,220,217
- DB 102,15,56,220,225
- DB 102,15,56,220,233
- DB 102,15,56,220,241
- DB 102,15,56,220,249
- DB 102,68,15,56,220,193
- DB 102,68,15,56,220,201
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
+ aesenc xmm4,xmm1
+ aesenc xmm5,xmm1
+ aesenc xmm6,xmm1
+ aesenc xmm7,xmm1
+ aesenc xmm8,xmm1
+ aesenc xmm9,xmm1
movups xmm1,XMMWORD[((176-128))+rcx]
- DB 102,15,56,220,208
- DB 102,15,56,220,216
- DB 102,15,56,220,224
- DB 102,15,56,220,232
- DB 102,15,56,220,240
- DB 102,15,56,220,248
- DB 102,68,15,56,220,192
- DB 102,68,15,56,220,200
+ aesenc xmm2,xmm0
+ aesenc xmm3,xmm0
+ aesenc xmm4,xmm0
+ aesenc xmm5,xmm0
+ aesenc xmm6,xmm0
+ aesenc xmm7,xmm0
+ aesenc xmm8,xmm0
+ aesenc xmm9,xmm0
movups xmm0,XMMWORD[((192-128))+rcx]
je NEAR $L$ctr32_enc_done
- DB 102,15,56,220,209
- DB 102,15,56,220,217
- DB 102,15,56,220,225
- DB 102,15,56,220,233
- DB 102,15,56,220,241
- DB 102,15,56,220,249
- DB 102,68,15,56,220,193
- DB 102,68,15,56,220,201
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
+ aesenc xmm4,xmm1
+ aesenc xmm5,xmm1
+ aesenc xmm6,xmm1
+ aesenc xmm7,xmm1
+ aesenc xmm8,xmm1
+ aesenc xmm9,xmm1
movups xmm1,XMMWORD[((208-128))+rcx]
- DB 102,15,56,220,208
- DB 102,15,56,220,216
- DB 102,15,56,220,224
- DB 102,15,56,220,232
- DB 102,15,56,220,240
- DB 102,15,56,220,248
- DB 102,68,15,56,220,192
- DB 102,68,15,56,220,200
+ aesenc xmm2,xmm0
+ aesenc xmm3,xmm0
+ aesenc xmm4,xmm0
+ aesenc xmm5,xmm0
+ aesenc xmm6,xmm0
+ aesenc xmm7,xmm0
+ aesenc xmm8,xmm0
+ aesenc xmm9,xmm0
movups xmm0,XMMWORD[((224-128))+rcx]
jmp NEAR $L$ctr32_enc_done
@@ -1217,35 +1217,35 @@
prefetcht0 [448+rdi]
prefetcht0 [512+rdi]
pxor xmm15,xmm0
- DB 102,15,56,220,209
- DB 102,15,56,220,217
- DB 102,15,56,220,225
- DB 102,15,56,220,233
- DB 102,15,56,220,241
- DB 102,15,56,220,249
- DB 102,68,15,56,220,193
- DB 102,68,15,56,220,201
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
+ aesenc xmm4,xmm1
+ aesenc xmm5,xmm1
+ aesenc xmm6,xmm1
+ aesenc xmm7,xmm1
+ aesenc xmm8,xmm1
+ aesenc xmm9,xmm1
movdqu xmm1,XMMWORD[96+rdi]
lea rdi,[128+rdi]
- DB 102,65,15,56,221,210
+ aesenclast xmm2,xmm10
pxor xmm1,xmm0
movdqu xmm10,XMMWORD[((112-128))+rdi]
- DB 102,65,15,56,221,219
+ aesenclast xmm3,xmm11
pxor xmm10,xmm0
movdqa xmm11,XMMWORD[rsp]
- DB 102,65,15,56,221,228
- DB 102,65,15,56,221,237
+ aesenclast xmm4,xmm12
+ aesenclast xmm5,xmm13
movdqa xmm12,XMMWORD[16+rsp]
movdqa xmm13,XMMWORD[32+rsp]
- DB 102,65,15,56,221,246
- DB 102,65,15,56,221,255
+ aesenclast xmm6,xmm14
+ aesenclast xmm7,xmm15
movdqa xmm14,XMMWORD[48+rsp]
movdqa xmm15,XMMWORD[64+rsp]
- DB 102,68,15,56,221,193
+ aesenclast xmm8,xmm1
movdqa xmm0,XMMWORD[80+rsp]
movups xmm1,XMMWORD[((16-128))+rcx]
- DB 102,69,15,56,221,202
+ aesenclast xmm9,xmm10
movups XMMWORD[rsi],xmm2
movdqa xmm2,xmm11
@@ -1284,19 +1284,19 @@
pxor xmm9,xmm9
movups xmm0,XMMWORD[16+rcx]
- DB 102,15,56,220,209
- DB 102,15,56,220,217
+ aesenc xmm2,xmm1
+ aesenc xmm3,xmm1
lea rcx,[((32-16))+rax*1+rcx]
neg rax
- DB 102,15,56,220,225
+ aesenc xmm4,xmm1
add rax,16
movups xmm10,XMMWORD[rdi]
- DB 102,15,56,220,233
- DB 102,15,56,220,241
+ aesenc xmm5,xmm1
+ aesenc xmm6,xmm1
movups xmm11,XMMWORD[16+rdi]
movups xmm12,XMMWORD[32+rdi]
- DB 102,15,56,220,249
- DB 102,68,15,56,220,193
+ aesenc xmm7,xmm1
+ aesenc xmm8,xmm1
call $L$enc_loop8_enter
@@ -1327,20 +1327,20 @@
ALIGN 32
$L$ctr32_loop4:
- DB 102,15,56,220,209
+ aesenc xmm2,xmm1
lea rcx,[16+rcx]
dec eax
- DB 102,15,56,220,217
- DB 102,15,56,220,225
- DB 102,15,56,220,233
+ aesenc xmm3,xmm1
+ aesenc xmm4,xmm1
+ aesenc xmm5,xmm1
movups xmm1,XMMWORD[rcx]
jnz NEAR $L$ctr32_loop4
- DB 102,15,56,221,209
- DB 102,15,56,221,217
+ aesenclast xmm2,xmm1
+ aesenclast xmm3,xmm1
movups xmm10,XMMWORD[rdi]
movups xmm11,XMMWORD[16+rdi]
- DB 102,15,56,221,225
- DB 102,15,56,221,233
+ aesenclast xmm4,xmm1
+ aesenclast xmm5,xmm1
movups xmm12,XMMWORD[32+rdi]
movups xmm13,XMMWORD[48+rdi]
@@ -1356,16 +1356,16 @@
ALIGN 32
$L$ctr32_loop3:
- DB 102,15,56,220,209
+ aesenc xmm2,xmm1
lea rcx,[16+rcx]
dec eax
- DB 102,15,56,220,217
- DB 102,15,56,220,225
+ aesenc xmm3,xmm1
+ aesenc xmm4,xmm1
movups xmm1,XMMWORD[rcx]
jnz NEAR $L$ctr32_loop3
- DB 102,15,56,221,209
- DB 102,15,56,221,217
- DB 102,15,56,221,225
+ aesenclast xmm2,xmm1
+ aesenclast xmm3,xmm1
+ aesenclast xmm4,xmm1
movups xmm10,XMMWORD[rdi]
xorps xmm2,xmm10
@@ -1471,12 +1471,12 @@
lea rcx,[32+rcx]
xorps xmm2,xmm3
$L$oop_enc1_6:
- DB 102,15,56,220,209
+ aesenc xmm2,xmm1
dec eax
movups xmm1,XMMWORD[rcx]
lea rcx,[16+rcx]
jnz NEAR $L$oop_enc1_6
- DB 102,15,56,221,209
+ aesenclast xmm2,xmm1
mov eax,r10d
mov rcx,r11
movups XMMWORD[rsi],xmm2
@@ -1522,12 +1522,12 @@
lea rcx,[32+rcx]
xorps xmm2,xmm0
$L$oop_dec1_7:
- DB 102,15,56,222,209
+ aesdec xmm2,xmm1
dec r10d
movups xmm1,XMMWORD[rcx]
lea rcx,[16+rcx]
jnz NEAR $L$oop_dec1_7
- DB 102,15,56,223,209
+ aesdeclast xmm2,xmm1
pxor xmm0,xmm0
pxor xmm1,xmm1
movdqu XMMWORD[r8],xmm4
@@ -1597,166 +1597,166 @@
pxor xmm7,xmm0
pxor xmm8,xmm0
- DB 102,15,56,222,209
+ aesdec xmm2,xmm1
pxor xmm9,xmm0
movups xmm0,XMMWORD[((32-112))+rcx]
- DB 102,15,56,222,217
- DB 102,15,56,222,225
- DB 102,15,56,222,233
- DB 102,15,56,222,241
- DB 102,15,56,222,249
- DB 102,68,15,56,222,193
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
+ aesdec xmm5,xmm1
+ aesdec xmm6,xmm1
+ aesdec xmm7,xmm1
+ aesdec xmm8,xmm1
adc rbp,0
and rbp,128
- DB 102,68,15,56,222,201
+ aesdec xmm9,xmm1
add rbp,rdi
movups xmm1,XMMWORD[((48-112))+rcx]
- DB 102,15,56,222,208
- DB 102,15,56,222,216
- DB 102,15,56,222,224
- DB 102,15,56,222,232
- DB 102,15,56,222,240
- DB 102,15,56,222,248
- DB 102,68,15,56,222,192
- DB 102,68,15,56,222,200
+ aesdec xmm2,xmm0
+ aesdec xmm3,xmm0
+ aesdec xmm4,xmm0
+ aesdec xmm5,xmm0
+ aesdec xmm6,xmm0
+ aesdec xmm7,xmm0
+ aesdec xmm8,xmm0
+ aesdec xmm9,xmm0
movups xmm0,XMMWORD[((64-112))+rcx]
nop
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,222,225
- DB 102,15,56,222,233
- DB 102,15,56,222,241
- DB 102,15,56,222,249
- DB 102,68,15,56,222,193
- DB 102,68,15,56,222,201
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
+ aesdec xmm5,xmm1
+ aesdec xmm6,xmm1
+ aesdec xmm7,xmm1
+ aesdec xmm8,xmm1
+ aesdec xmm9,xmm1
movups xmm1,XMMWORD[((80-112))+rcx]
nop
- DB 102,15,56,222,208
- DB 102,15,56,222,216
- DB 102,15,56,222,224
- DB 102,15,56,222,232
- DB 102,15,56,222,240
- DB 102,15,56,222,248
- DB 102,68,15,56,222,192
- DB 102,68,15,56,222,200
+ aesdec xmm2,xmm0
+ aesdec xmm3,xmm0
+ aesdec xmm4,xmm0
+ aesdec xmm5,xmm0
+ aesdec xmm6,xmm0
+ aesdec xmm7,xmm0
+ aesdec xmm8,xmm0
+ aesdec xmm9,xmm0
movups xmm0,XMMWORD[((96-112))+rcx]
nop
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,222,225
- DB 102,15,56,222,233
- DB 102,15,56,222,241
- DB 102,15,56,222,249
- DB 102,68,15,56,222,193
- DB 102,68,15,56,222,201
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
+ aesdec xmm5,xmm1
+ aesdec xmm6,xmm1
+ aesdec xmm7,xmm1
+ aesdec xmm8,xmm1
+ aesdec xmm9,xmm1
movups xmm1,XMMWORD[((112-112))+rcx]
nop
- DB 102,15,56,222,208
- DB 102,15,56,222,216
- DB 102,15,56,222,224
- DB 102,15,56,222,232
- DB 102,15,56,222,240
- DB 102,15,56,222,248
- DB 102,68,15,56,222,192
- DB 102,68,15,56,222,200
+ aesdec xmm2,xmm0
+ aesdec xmm3,xmm0
+ aesdec xmm4,xmm0
+ aesdec xmm5,xmm0
+ aesdec xmm6,xmm0
+ aesdec xmm7,xmm0
+ aesdec xmm8,xmm0
+ aesdec xmm9,xmm0
movups xmm0,XMMWORD[((128-112))+rcx]
nop
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,222,225
- DB 102,15,56,222,233
- DB 102,15,56,222,241
- DB 102,15,56,222,249
- DB 102,68,15,56,222,193
- DB 102,68,15,56,222,201
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
+ aesdec xmm5,xmm1
+ aesdec xmm6,xmm1
+ aesdec xmm7,xmm1
+ aesdec xmm8,xmm1
+ aesdec xmm9,xmm1
movups xmm1,XMMWORD[((144-112))+rcx]
cmp eax,11
- DB 102,15,56,222,208
- DB 102,15,56,222,216
- DB 102,15,56,222,224
- DB 102,15,56,222,232
- DB 102,15,56,222,240
- DB 102,15,56,222,248
- DB 102,68,15,56,222,192
- DB 102,68,15,56,222,200
+ aesdec xmm2,xmm0
+ aesdec xmm3,xmm0
+ aesdec xmm4,xmm0
+ aesdec xmm5,xmm0
+ aesdec xmm6,xmm0
+ aesdec xmm7,xmm0
+ aesdec xmm8,xmm0
+ aesdec xmm9,xmm0
movups xmm0,XMMWORD[((160-112))+rcx]
jb NEAR $L$cbc_dec_done
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,222,225
- DB 102,15,56,222,233
- DB 102,15,56,222,241
- DB 102,15,56,222,249
- DB 102,68,15,56,222,193
- DB 102,68,15,56,222,201
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
+ aesdec xmm5,xmm1
+ aesdec xmm6,xmm1
+ aesdec xmm7,xmm1
+ aesdec xmm8,xmm1
+ aesdec xmm9,xmm1
movups xmm1,XMMWORD[((176-112))+rcx]
nop
- DB 102,15,56,222,208
- DB 102,15,56,222,216
- DB 102,15,56,222,224
- DB 102,15,56,222,232
- DB 102,15,56,222,240
- DB 102,15,56,222,248
- DB 102,68,15,56,222,192
- DB 102,68,15,56,222,200
+ aesdec xmm2,xmm0
+ aesdec xmm3,xmm0
+ aesdec xmm4,xmm0
+ aesdec xmm5,xmm0
+ aesdec xmm6,xmm0
+ aesdec xmm7,xmm0
+ aesdec xmm8,xmm0
+ aesdec xmm9,xmm0
movups xmm0,XMMWORD[((192-112))+rcx]
je NEAR $L$cbc_dec_done
- DB 102,15,56,222,209
- DB 102,15,56,222,217
- DB 102,15,56,222,225
- DB 102,15,56,222,233
- DB 102,15,56,222,241
- DB 102,15,56,222,249
- DB 102,68,15,56,222,193
- DB 102,68,15,56,222,201
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
+ aesdec xmm4,xmm1
+ aesdec xmm5,xmm1
+ aesdec xmm6,xmm1
+ aesdec xmm7,xmm1
+ aesdec xmm8,xmm1
+ aesdec xmm9,xmm1
movups xmm1,XMMWORD[((208-112))+rcx]
nop
- DB 102,15,56,222,208
- DB 102,15,56,222,216
- DB 102,15,56,222,224
- DB 102,15,56,222,232
- DB 102,15,56,222,240
- DB 102,15,56,222,248
- DB 102,68,15,56,222,192
- DB 102,68,15,56,222,200
+ aesdec xmm2,xmm0
+ aesdec xmm3,xmm0
+ aesdec xmm4,xmm0
+ aesdec xmm5,xmm0
+ aesdec xmm6,xmm0
+ aesdec xmm7,xmm0
+ aesdec xmm8,xmm0
+ aesdec xmm9,xmm0
movups xmm0,XMMWORD[((224-112))+rcx]
jmp NEAR $L$cbc_dec_done
ALIGN 16
$L$cbc_dec_done:
- DB 102,15,56,222,209
- DB 102,15,56,222,217
+ aesdec xmm2,xmm1
+ aesdec xmm3,xmm1
pxor xmm10,xmm0
pxor xmm11,xmm0
- DB 102,15,56,222,225
- DB 102,15,56,222,233
+ aesdec xmm4,xmm1
+ aesdec xmm5,xmm1
pxor xmm12,xmm0
pxor xmm13,xmm0
- DB 102,15,56,222,241
- DB 102,15,56,222,249
+ aesdec xmm6,xmm1
+ aesdec xmm7,xmm1
pxor xmm14,xmm0
pxor xmm15,xmm0
- DB 102,68,15,56,222,193
- DB 102,68,15,56,222,201
+ aesdec xmm8,xmm1
+ aesdec xmm9,xmm1
movdqu xmm1,XMMWORD[80+rdi]
- DB 102,65,15,56,223,210
+ aesdeclast xmm2,xmm10
movdqu xmm10,XMMWORD[96+rdi]
pxor xmm1,xmm0
- DB 102,65,15,56,223,219
+ aesdeclast xmm3,xmm11
pxor xmm10,xmm0
movdqu xmm0,XMMWORD[112+rdi]
- DB 102,65,15,56,223,228
+ aesdeclast xmm4,xmm12
lea rdi,[128+rdi]
movdqu xmm11,XMMWORD[rbp]
- DB 102,65,15,56,223,237
- DB 102,65,15,56,223,246
+ aesdeclast xmm5,xmm13
+ aesdeclast xmm6,xmm14
movdqu xmm12,XMMWORD[16+rbp]
movdqu xmm13,XMMWORD[32+rbp]
- DB 102,65,15,56,223,255
- DB 102,68,15,56,223,193
+ aesdeclast xmm7,xmm15
+ aesdeclast xmm8,xmm1
movdqu xmm14,XMMWORD[48+rbp]
movdqu xmm15,XMMWORD[64+rbp]
- DB 102,69,15,56,223,202
+ aesdeclast xmm9,xmm10
movdqa xmm10,xmm0
movdqu xmm1,XMMWORD[80+rbp]
movups xmm0,XMMWORD[((-112))+rcx]
@@ -1900,12 +1900,12 @@
lea rcx,[32+rcx]
xorps xmm2,xmm0
$L$oop_dec1_8:
- DB 102,15,56,222,209
+ aesdec xmm2,xmm1
dec eax
movups xmm1,XMMWORD[rcx]
lea rcx,[16+rcx]
jnz NEAR $L$oop_dec1_8
- DB 102,15,56,223,209
+ aesdeclast xmm2,xmm1
xorps xmm2,xmm10
movaps xmm10,xmm11
jmp NEAR $L$cbc_dec_tail_collected
@@ -2033,8 +2033,8 @@
$L$dec_key_inverse:
movups xmm0,XMMWORD[rcx]
movups xmm1,XMMWORD[r8]
- DB 102,15,56,219,192
- DB 102,15,56,219,201
+ aesimc xmm0,xmm0
+ aesimc xmm1,xmm1
lea rcx,[16+rcx]
lea r8,[((-16))+r8]
movups XMMWORD[16+r8],xmm0
@@ -2043,7 +2043,7 @@
ja NEAR $L$dec_key_inverse
movups xmm0,XMMWORD[rcx]
- DB 102,15,56,219,192
+ aesimc xmm0,xmm0
pxor xmm1,xmm1
movups XMMWORD[r8],xmm0
pxor xmm0,xmm0
@@ -2078,25 +2078,25 @@
mov edx,9
movups XMMWORD[r8],xmm0
- DB 102,15,58,223,200,1
+ aeskeygenassist xmm1,xmm0,0x1
call $L$key_expansion_128_cold
- DB 102,15,58,223,200,2
+ aeskeygenassist xmm1,xmm0,0x2
call $L$key_expansion_128
- DB 102,15,58,223,200,4
+ aeskeygenassist xmm1,xmm0,0x4
call $L$key_expansion_128
- DB 102,15,58,223,200,8
+ aeskeygenassist xmm1,xmm0,0x8
call $L$key_expansion_128
- DB 102,15,58,223,200,16
+ aeskeygenassist xmm1,xmm0,0x10
call $L$key_expansion_128
- DB 102,15,58,223,200,32
+ aeskeygenassist xmm1,xmm0,0x20
call $L$key_expansion_128
- DB 102,15,58,223,200,64
+ aeskeygenassist xmm1,xmm0,0x40
call $L$key_expansion_128
- DB 102,15,58,223,200,128
+ aeskeygenassist xmm1,xmm0,0x80
call $L$key_expansion_128
- DB 102,15,58,223,200,27
+ aeskeygenassist xmm1,xmm0,0x1b
call $L$key_expansion_128
- DB 102,15,58,223,200,54
+ aeskeygenassist xmm1,xmm0,0x36
call $L$key_expansion_128
movups XMMWORD[rax],xmm0
mov DWORD[80+rax],edx
@@ -2109,21 +2109,21 @@
mov edx,11
movups XMMWORD[r8],xmm0
- DB 102,15,58,223,202,1
+ aeskeygenassist xmm1,xmm2,0x1
call $L$key_expansion_192a_cold
- DB 102,15,58,223,202,2
+ aeskeygenassist xmm1,xmm2,0x2
call $L$key_expansion_192b
- DB 102,15,58,223,202,4
+ aeskeygenassist xmm1,xmm2,0x4
call $L$key_expansion_192a
- DB 102,15,58,223,202,8
+ aeskeygenassist xmm1,xmm2,0x8
call $L$key_expansion_192b
- DB 102,15,58,223,202,16
+ aeskeygenassist xmm1,xmm2,0x10
call $L$key_expansion_192a
- DB 102,15,58,223,202,32
+ aeskeygenassist xmm1,xmm2,0x20
call $L$key_expansion_192b
- DB 102,15,58,223,202,64
+ aeskeygenassist xmm1,xmm2,0x40
call $L$key_expansion_192a
- DB 102,15,58,223,202,128
+ aeskeygenassist xmm1,xmm2,0x80
call $L$key_expansion_192b
movups XMMWORD[rax],xmm0
mov DWORD[48+rax],edx
@@ -2138,31 +2138,31 @@
movups XMMWORD[r8],xmm0
movups XMMWORD[16+r8],xmm2
- DB 102,15,58,223,202,1
+ aeskeygenassist xmm1,xmm2,0x1
call $L$key_expansion_256a_cold
- DB 102,15,58,223,200,1
+ aeskeygenassist xmm1,xmm0,0x1
call $L$key_expansion_256b
- DB 102,15,58,223,202,2
+ aeskeygenassist xmm1,xmm2,0x2
call $L$key_expansion_256a
- DB 102,15,58,223,200,2
+ aeskeygenassist xmm1,xmm0,0x2
call $L$key_expansion_256b
- DB 102,15,58,223,202,4
+ aeskeygenassist xmm1,xmm2,0x4
call $L$key_expansion_256a
- DB 102,15,58,223,200,4
+ aeskeygenassist xmm1,xmm0,0x4
call $L$key_expansion_256b
- DB 102,15,58,223,202,8
+ aeskeygenassist xmm1,xmm2,0x8
call $L$key_expansion_256a
- DB 102,15,58,223,200,8
+ aeskeygenassist xmm1,xmm0,0x8
call $L$key_expansion_256b
- DB 102,15,58,223,202,16
+ aeskeygenassist xmm1,xmm2,0x10
call $L$key_expansion_256a
- DB 102,15,58,223,200,16
+ aeskeygenassist xmm1,xmm0,0x10
call $L$key_expansion_256b
- DB 102,15,58,223,202,32
+ aeskeygenassist xmm1,xmm2,0x20
call $L$key_expansion_256a
- DB 102,15,58,223,200,32
+ aeskeygenassist xmm1,xmm0,0x20
call $L$key_expansion_256b
- DB 102,15,58,223,202,64
+ aeskeygenassist xmm1,xmm2,0x40
call $L$key_expansion_256a
movups XMMWORD[rax],xmm0
mov DWORD[16+rax],edx
@@ -2299,8 +2299,8 @@
ALIGN 16
$L$oop_key128:
-DB 102,15,56,0,197
- DB 102,15,56,221,196
+ pshufb xmm0,xmm5
+ aesenclast xmm0,xmm4
pslld xmm4,1
lea rax,[16+rax]
@@ -2321,8 +2321,8 @@
movdqa xmm4,XMMWORD[$L$key_rcon1b]
-DB 102,15,56,0,197
- DB 102,15,56,221,196
+ pshufb xmm0,xmm5
+ aesenclast xmm0,xmm4
pslld xmm4,1
movdqa xmm3,xmm2
@@ -2337,8 +2337,8 @@
movdqu XMMWORD[rax],xmm0
movdqa xmm2,xmm0
-DB 102,15,56,0,197
- DB 102,15,56,221,196
+ pshufb xmm0,xmm5
+ aesenclast xmm0,xmm4
movdqa xmm3,xmm2
pslldq xmm2,4
@@ -2369,8 +2369,8 @@
$L$oop_key192:
movq QWORD[rax],xmm2
movdqa xmm1,xmm2
-DB 102,15,56,0,213
- DB 102,15,56,221,212
+ pshufb xmm2,xmm5
+ aesenclast xmm2,xmm4
pslld xmm4,1
lea rax,[24+rax]
@@ -2413,8 +2413,8 @@
ALIGN 16
$L$oop_key256:
-DB 102,15,56,0,213
- DB 102,15,56,221,212
+ pshufb xmm2,xmm5
+ aesenclast xmm2,xmm4
movdqa xmm3,xmm0
pslldq xmm0,4
@@ -2433,7 +2433,7 @@
pshufd xmm2,xmm0,0xff
pxor xmm3,xmm3
- DB 102,15,56,221,211
+ aesenclast xmm2,xmm3
movdqa xmm3,xmm1
pslldq xmm1,4
diff --git a/gen/bcm/ghash-ssse3-x86_64-apple.S b/gen/bcm/ghash-ssse3-x86_64-apple.S
index 651cca3..53af23f 100644
--- a/gen/bcm/ghash-ssse3-x86_64-apple.S
+++ b/gen/bcm/ghash-ssse3-x86_64-apple.S
@@ -23,7 +23,7 @@
movdqa L$low4_mask(%rip),%xmm2
-.byte 102,65,15,56,0,194
+ pshufb %xmm10,%xmm0
movdqa %xmm2,%xmm1
@@ -43,7 +43,7 @@
movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
+ palignr $1,%xmm3,%xmm6
movdqa %xmm6,%xmm3
psrldq $1,%xmm2
@@ -51,8 +51,8 @@
movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
+ pshufb %xmm0,%xmm4
+ pshufb %xmm1,%xmm5
pxor %xmm5,%xmm2
@@ -91,7 +91,7 @@
movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
+ palignr $1,%xmm3,%xmm6
movdqa %xmm6,%xmm3
psrldq $1,%xmm2
@@ -99,8 +99,8 @@
movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
+ pshufb %xmm0,%xmm4
+ pshufb %xmm1,%xmm5
pxor %xmm5,%xmm2
@@ -139,7 +139,7 @@
movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
+ palignr $1,%xmm3,%xmm6
movdqa %xmm6,%xmm3
psrldq $1,%xmm2
@@ -147,8 +147,8 @@
movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
+ pshufb %xmm0,%xmm4
+ pshufb %xmm1,%xmm5
pxor %xmm5,%xmm2
@@ -181,7 +181,7 @@
pxor %xmm3,%xmm2
pxor %xmm3,%xmm3
-.byte 102,65,15,56,0,210
+ pshufb %xmm10,%xmm2
movdqu %xmm2,(%rdi)
@@ -218,14 +218,14 @@
-.byte 102,65,15,56,0,194
+ pshufb %xmm10,%xmm0
pxor %xmm3,%xmm3
L$oop_ghash:
movdqu (%rdx),%xmm1
-.byte 102,65,15,56,0,202
+ pshufb %xmm10,%xmm1
pxor %xmm1,%xmm0
@@ -246,7 +246,7 @@
movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
+ palignr $1,%xmm3,%xmm6
movdqa %xmm6,%xmm3
psrldq $1,%xmm2
@@ -254,8 +254,8 @@
movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
+ pshufb %xmm0,%xmm4
+ pshufb %xmm1,%xmm5
pxor %xmm5,%xmm2
@@ -294,7 +294,7 @@
movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
+ palignr $1,%xmm3,%xmm6
movdqa %xmm6,%xmm3
psrldq $1,%xmm2
@@ -302,8 +302,8 @@
movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
+ pshufb %xmm0,%xmm4
+ pshufb %xmm1,%xmm5
pxor %xmm5,%xmm2
@@ -342,7 +342,7 @@
movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
+ palignr $1,%xmm3,%xmm6
movdqa %xmm6,%xmm3
psrldq $1,%xmm2
@@ -350,8 +350,8 @@
movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
+ pshufb %xmm0,%xmm4
+ pshufb %xmm1,%xmm5
pxor %xmm5,%xmm2
@@ -394,7 +394,7 @@
jnz L$oop_ghash
-.byte 102,65,15,56,0,194
+ pshufb %xmm10,%xmm0
movdqu %xmm0,(%rdi)
diff --git a/gen/bcm/ghash-ssse3-x86_64-linux.S b/gen/bcm/ghash-ssse3-x86_64-linux.S
index 84ac20a..edce38d 100644
--- a/gen/bcm/ghash-ssse3-x86_64-linux.S
+++ b/gen/bcm/ghash-ssse3-x86_64-linux.S
@@ -23,7 +23,7 @@
movdqa .Llow4_mask(%rip),%xmm2
-.byte 102,65,15,56,0,194
+ pshufb %xmm10,%xmm0
movdqa %xmm2,%xmm1
@@ -43,7 +43,7 @@
movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
+ palignr $1,%xmm3,%xmm6
movdqa %xmm6,%xmm3
psrldq $1,%xmm2
@@ -51,8 +51,8 @@
movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
+ pshufb %xmm0,%xmm4
+ pshufb %xmm1,%xmm5
pxor %xmm5,%xmm2
@@ -91,7 +91,7 @@
movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
+ palignr $1,%xmm3,%xmm6
movdqa %xmm6,%xmm3
psrldq $1,%xmm2
@@ -99,8 +99,8 @@
movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
+ pshufb %xmm0,%xmm4
+ pshufb %xmm1,%xmm5
pxor %xmm5,%xmm2
@@ -139,7 +139,7 @@
movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
+ palignr $1,%xmm3,%xmm6
movdqa %xmm6,%xmm3
psrldq $1,%xmm2
@@ -147,8 +147,8 @@
movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
+ pshufb %xmm0,%xmm4
+ pshufb %xmm1,%xmm5
pxor %xmm5,%xmm2
@@ -181,7 +181,7 @@
pxor %xmm3,%xmm2
pxor %xmm3,%xmm3
-.byte 102,65,15,56,0,210
+ pshufb %xmm10,%xmm2
movdqu %xmm2,(%rdi)
@@ -218,14 +218,14 @@
-.byte 102,65,15,56,0,194
+ pshufb %xmm10,%xmm0
pxor %xmm3,%xmm3
.Loop_ghash:
movdqu (%rdx),%xmm1
-.byte 102,65,15,56,0,202
+ pshufb %xmm10,%xmm1
pxor %xmm1,%xmm0
@@ -246,7 +246,7 @@
movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
+ palignr $1,%xmm3,%xmm6
movdqa %xmm6,%xmm3
psrldq $1,%xmm2
@@ -254,8 +254,8 @@
movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
+ pshufb %xmm0,%xmm4
+ pshufb %xmm1,%xmm5
pxor %xmm5,%xmm2
@@ -294,7 +294,7 @@
movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
+ palignr $1,%xmm3,%xmm6
movdqa %xmm6,%xmm3
psrldq $1,%xmm2
@@ -302,8 +302,8 @@
movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
+ pshufb %xmm0,%xmm4
+ pshufb %xmm1,%xmm5
pxor %xmm5,%xmm2
@@ -342,7 +342,7 @@
movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
+ palignr $1,%xmm3,%xmm6
movdqa %xmm6,%xmm3
psrldq $1,%xmm2
@@ -350,8 +350,8 @@
movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
+ pshufb %xmm0,%xmm4
+ pshufb %xmm1,%xmm5
pxor %xmm5,%xmm2
@@ -394,7 +394,7 @@
jnz .Loop_ghash
-.byte 102,65,15,56,0,194
+ pshufb %xmm10,%xmm0
movdqu %xmm0,(%rdi)
diff --git a/gen/bcm/ghash-ssse3-x86_64-win.asm b/gen/bcm/ghash-ssse3-x86_64-win.asm
index c00e039..5bcd094 100644
--- a/gen/bcm/ghash-ssse3-x86_64-win.asm
+++ b/gen/bcm/ghash-ssse3-x86_64-win.asm
@@ -37,7 +37,7 @@
movdqa xmm2,XMMWORD[$L$low4_mask]
-DB 102,65,15,56,0,194
+ pshufb xmm0,xmm10
movdqa xmm1,xmm2
@@ -57,7 +57,7 @@
movdqa xmm6,xmm2
-DB 102,15,58,15,243,1
+ palignr xmm6,xmm3,1
movdqa xmm3,xmm6
psrldq xmm2,1
@@ -65,8 +65,8 @@
movdqa xmm5,xmm4
-DB 102,15,56,0,224
-DB 102,15,56,0,233
+ pshufb xmm4,xmm0
+ pshufb xmm5,xmm1
pxor xmm2,xmm5
@@ -105,7 +105,7 @@
movdqa xmm6,xmm2
-DB 102,15,58,15,243,1
+ palignr xmm6,xmm3,1
movdqa xmm3,xmm6
psrldq xmm2,1
@@ -113,8 +113,8 @@
movdqa xmm5,xmm4
-DB 102,15,56,0,224
-DB 102,15,56,0,233
+ pshufb xmm4,xmm0
+ pshufb xmm5,xmm1
pxor xmm2,xmm5
@@ -153,7 +153,7 @@
movdqa xmm6,xmm2
-DB 102,15,58,15,243,1
+ palignr xmm6,xmm3,1
movdqa xmm3,xmm6
psrldq xmm2,1
@@ -161,8 +161,8 @@
movdqa xmm5,xmm4
-DB 102,15,56,0,224
-DB 102,15,56,0,233
+ pshufb xmm4,xmm0
+ pshufb xmm5,xmm1
pxor xmm2,xmm5
@@ -195,7 +195,7 @@
pxor xmm2,xmm3
pxor xmm3,xmm3
-DB 102,65,15,56,0,210
+ pshufb xmm2,xmm10
movdqu XMMWORD[rcx],xmm2
@@ -243,14 +243,14 @@
-DB 102,65,15,56,0,194
+ pshufb xmm0,xmm10
pxor xmm3,xmm3
$L$oop_ghash:
movdqu xmm1,XMMWORD[r8]
-DB 102,65,15,56,0,202
+ pshufb xmm1,xmm10
pxor xmm0,xmm1
@@ -271,7 +271,7 @@
movdqa xmm6,xmm2
-DB 102,15,58,15,243,1
+ palignr xmm6,xmm3,1
movdqa xmm3,xmm6
psrldq xmm2,1
@@ -279,8 +279,8 @@
movdqa xmm5,xmm4
-DB 102,15,56,0,224
-DB 102,15,56,0,233
+ pshufb xmm4,xmm0
+ pshufb xmm5,xmm1
pxor xmm2,xmm5
@@ -319,7 +319,7 @@
movdqa xmm6,xmm2
-DB 102,15,58,15,243,1
+ palignr xmm6,xmm3,1
movdqa xmm3,xmm6
psrldq xmm2,1
@@ -327,8 +327,8 @@
movdqa xmm5,xmm4
-DB 102,15,56,0,224
-DB 102,15,56,0,233
+ pshufb xmm4,xmm0
+ pshufb xmm5,xmm1
pxor xmm2,xmm5
@@ -367,7 +367,7 @@
movdqa xmm6,xmm2
-DB 102,15,58,15,243,1
+ palignr xmm6,xmm3,1
movdqa xmm3,xmm6
psrldq xmm2,1
@@ -375,8 +375,8 @@
movdqa xmm5,xmm4
-DB 102,15,56,0,224
-DB 102,15,56,0,233
+ pshufb xmm4,xmm0
+ pshufb xmm5,xmm1
pxor xmm2,xmm5
@@ -419,7 +419,7 @@
jnz NEAR $L$oop_ghash
-DB 102,65,15,56,0,194
+ pshufb xmm0,xmm10
movdqu XMMWORD[rcx],xmm0
diff --git a/gen/bcm/ghash-x86_64-apple.S b/gen/bcm/ghash-x86_64-apple.S
index 4961298..0cf60d1 100644
--- a/gen/bcm/ghash-x86_64-apple.S
+++ b/gen/bcm/ghash-x86_64-apple.S
@@ -38,9 +38,9 @@
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,222,0
+ pclmulqdq $0x00,%xmm2,%xmm0
+ pclmulqdq $0x11,%xmm2,%xmm1
+ pclmulqdq $0x00,%xmm6,%xmm3
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -78,14 +78,14 @@
movdqu %xmm2,0(%rdi)
pxor %xmm0,%xmm4
movdqu %xmm0,16(%rdi)
-.byte 102,15,58,15,227,8
+ palignr $8,%xmm3,%xmm4
movdqu %xmm4,32(%rdi)
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,222,0
+ pclmulqdq $0x00,%xmm2,%xmm0
+ pclmulqdq $0x11,%xmm2,%xmm1
+ pclmulqdq $0x00,%xmm6,%xmm3
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -121,9 +121,9 @@
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,222,0
+ pclmulqdq $0x00,%xmm2,%xmm0
+ pclmulqdq $0x11,%xmm2,%xmm1
+ pclmulqdq $0x00,%xmm6,%xmm3
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -161,7 +161,7 @@
movdqu %xmm5,48(%rdi)
pxor %xmm0,%xmm4
movdqu %xmm0,64(%rdi)
-.byte 102,15,58,15,227,8
+ palignr $8,%xmm3,%xmm4
movdqu %xmm4,80(%rdi)
ret
@@ -179,13 +179,13 @@
movdqa L$bswap_mask(%rip),%xmm5
movdqu (%rsi),%xmm2
movdqu 32(%rsi),%xmm4
-.byte 102,15,56,0,197
+ pshufb %xmm5,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+ pclmulqdq $0x00,%xmm2,%xmm0
+ pclmulqdq $0x11,%xmm2,%xmm1
+ pclmulqdq $0x00,%xmm4,%xmm3
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -217,7 +217,7 @@
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
-.byte 102,15,56,0,197
+ pshufb %xmm5,%xmm0
movdqu %xmm0,(%rdi)
ret
@@ -236,7 +236,7 @@
movdqu (%rdi),%xmm0
movdqu (%rsi),%xmm2
movdqu 32(%rsi),%xmm7
-.byte 102,65,15,56,0,194
+ pshufb %xmm10,%xmm0
subq $0x10,%rcx
jz L$odd_tail
@@ -255,21 +255,21 @@
movdqu 48(%rdx),%xmm3
movdqu 32(%rdx),%xmm11
-.byte 102,65,15,56,0,218
-.byte 102,69,15,56,0,218
+ pshufb %xmm10,%xmm3
+ pshufb %xmm10,%xmm11
movdqa %xmm3,%xmm5
pshufd $78,%xmm3,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,68,218,0
-.byte 102,15,58,68,234,17
-.byte 102,15,58,68,231,0
+ pclmulqdq $0x00,%xmm2,%xmm3
+ pclmulqdq $0x11,%xmm2,%xmm5
+ pclmulqdq $0x00,%xmm7,%xmm4
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm11,%xmm12
-.byte 102,68,15,58,68,222,0
-.byte 102,68,15,58,68,238,17
-.byte 102,68,15,58,68,231,16
+ pclmulqdq $0x00,%xmm6,%xmm11
+ pclmulqdq $0x11,%xmm6,%xmm13
+ pclmulqdq $0x10,%xmm7,%xmm12
xorps %xmm11,%xmm3
xorps %xmm13,%xmm5
movups 80(%rsi),%xmm7
@@ -277,18 +277,18 @@
movdqu 16(%rdx),%xmm11
movdqu 0(%rdx),%xmm8
-.byte 102,69,15,56,0,218
-.byte 102,69,15,56,0,194
+ pshufb %xmm10,%xmm11
+ pshufb %xmm10,%xmm8
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,222,0
+ pclmulqdq $0x00,%xmm14,%xmm11
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm8
pxor %xmm0,%xmm8
-.byte 102,69,15,58,68,238,17
-.byte 102,68,15,58,68,231,0
+ pclmulqdq $0x11,%xmm14,%xmm13
+ pclmulqdq $0x00,%xmm7,%xmm12
xorps %xmm11,%xmm3
xorps %xmm13,%xmm5
@@ -299,22 +299,22 @@
jmp L$mod4_loop
.p2align 5
L$mod4_loop:
-.byte 102,65,15,58,68,199,0
+ pclmulqdq $0x00,%xmm15,%xmm0
xorps %xmm12,%xmm4
movdqu 48(%rdx),%xmm11
-.byte 102,69,15,56,0,218
-.byte 102,65,15,58,68,207,17
+ pshufb %xmm10,%xmm11
+ pclmulqdq $0x11,%xmm15,%xmm1
xorps %xmm3,%xmm0
movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
-.byte 102,68,15,58,68,199,16
+ pclmulqdq $0x10,%xmm7,%xmm8
pshufd $78,%xmm11,%xmm12
xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
-.byte 102,65,15,56,0,218
+ pshufb %xmm10,%xmm3
movups 32(%rsi),%xmm7
xorps %xmm4,%xmm8
-.byte 102,68,15,58,68,218,0
+ pclmulqdq $0x00,%xmm2,%xmm11
pshufd $78,%xmm3,%xmm4
pxor %xmm0,%xmm8
@@ -322,22 +322,22 @@
pxor %xmm1,%xmm8
pxor %xmm3,%xmm4
movdqa %xmm8,%xmm9
-.byte 102,68,15,58,68,234,17
+ pclmulqdq $0x11,%xmm2,%xmm13
pslldq $8,%xmm8
psrldq $8,%xmm9
pxor %xmm8,%xmm0
movdqa L$7_mask(%rip),%xmm8
pxor %xmm9,%xmm1
-.byte 102,76,15,110,200
+ movq %rax,%xmm9
pand %xmm0,%xmm8
-.byte 102,69,15,56,0,200
+ pshufb %xmm8,%xmm9
pxor %xmm0,%xmm9
-.byte 102,68,15,58,68,231,0
+ pclmulqdq $0x00,%xmm7,%xmm12
psllq $57,%xmm9
movdqa %xmm9,%xmm8
pslldq $8,%xmm9
-.byte 102,15,58,68,222,0
+ pclmulqdq $0x00,%xmm6,%xmm3
psrldq $8,%xmm8
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
@@ -345,14 +345,14 @@
movdqa %xmm0,%xmm9
psrlq $1,%xmm0
-.byte 102,15,58,68,238,17
+ pclmulqdq $0x11,%xmm6,%xmm5
xorps %xmm11,%xmm3
movdqu 16(%rdx),%xmm11
-.byte 102,69,15,56,0,218
-.byte 102,15,58,68,231,16
+ pshufb %xmm10,%xmm11
+ pclmulqdq $0x10,%xmm7,%xmm4
xorps %xmm13,%xmm5
movups 80(%rsi),%xmm7
-.byte 102,69,15,56,0,194
+ pshufb %xmm10,%xmm8
pxor %xmm9,%xmm1
pxor %xmm0,%xmm9
psrlq $5,%xmm0
@@ -363,16 +363,16 @@
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,222,0
+ pclmulqdq $0x00,%xmm14,%xmm11
psrlq $1,%xmm0
pxor %xmm1,%xmm0
movdqa %xmm0,%xmm1
-.byte 102,69,15,58,68,238,17
+ pclmulqdq $0x11,%xmm14,%xmm13
xorps %xmm11,%xmm3
pshufd $78,%xmm0,%xmm8
pxor %xmm0,%xmm8
-.byte 102,68,15,58,68,231,0
+ pclmulqdq $0x00,%xmm7,%xmm12
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
@@ -380,9 +380,9 @@
jnc L$mod4_loop
L$tail4x:
-.byte 102,65,15,58,68,199,0
-.byte 102,65,15,58,68,207,17
-.byte 102,68,15,58,68,199,16
+ pclmulqdq $0x00,%xmm15,%xmm0
+ pclmulqdq $0x11,%xmm15,%xmm1
+ pclmulqdq $0x10,%xmm7,%xmm8
xorps %xmm12,%xmm4
xorps %xmm3,%xmm0
xorps %xmm5,%xmm1
@@ -433,16 +433,16 @@
movdqu (%rdx),%xmm8
movdqu 16(%rdx),%xmm3
-.byte 102,69,15,56,0,194
-.byte 102,65,15,56,0,218
+ pshufb %xmm10,%xmm8
+ pshufb %xmm10,%xmm3
pxor %xmm8,%xmm0
movdqa %xmm3,%xmm5
pshufd $78,%xmm3,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,68,218,0
-.byte 102,15,58,68,234,17
-.byte 102,15,58,68,231,0
+ pclmulqdq $0x00,%xmm2,%xmm3
+ pclmulqdq $0x11,%xmm2,%xmm5
+ pclmulqdq $0x00,%xmm7,%xmm4
leaq 32(%rdx),%rdx
nop
@@ -458,21 +458,21 @@
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,15,58,68,198,0
-.byte 102,15,58,68,206,17
-.byte 102,15,58,68,231,16
+ pclmulqdq $0x00,%xmm6,%xmm0
+ pclmulqdq $0x11,%xmm6,%xmm1
+ pclmulqdq $0x10,%xmm7,%xmm4
pxor %xmm3,%xmm0
pxor %xmm5,%xmm1
movdqu (%rdx),%xmm9
pxor %xmm0,%xmm8
-.byte 102,69,15,56,0,202
+ pshufb %xmm10,%xmm9
movdqu 16(%rdx),%xmm3
pxor %xmm1,%xmm8
pxor %xmm9,%xmm1
pxor %xmm8,%xmm4
-.byte 102,65,15,56,0,218
+ pshufb %xmm10,%xmm3
movdqa %xmm4,%xmm8
psrldq $8,%xmm8
pslldq $8,%xmm4
@@ -485,7 +485,7 @@
movdqa %xmm0,%xmm8
psllq $5,%xmm0
pxor %xmm0,%xmm8
-.byte 102,15,58,68,218,0
+ pclmulqdq $0x00,%xmm2,%xmm3
psllq $1,%xmm0
pxor %xmm8,%xmm0
psllq $57,%xmm0
@@ -499,14 +499,14 @@
movdqa %xmm0,%xmm9
psrlq $1,%xmm0
-.byte 102,15,58,68,234,17
+ pclmulqdq $0x11,%xmm2,%xmm5
pxor %xmm9,%xmm1
pxor %xmm0,%xmm9
psrlq $5,%xmm0
pxor %xmm9,%xmm0
leaq 32(%rdx),%rdx
psrlq $1,%xmm0
-.byte 102,15,58,68,231,0
+ pclmulqdq $0x00,%xmm7,%xmm4
pxor %xmm1,%xmm0
subq $0x20,%rcx
@@ -518,9 +518,9 @@
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,15,58,68,198,0
-.byte 102,15,58,68,206,17
-.byte 102,15,58,68,231,16
+ pclmulqdq $0x00,%xmm6,%xmm0
+ pclmulqdq $0x11,%xmm6,%xmm1
+ pclmulqdq $0x10,%xmm7,%xmm4
pxor %xmm3,%xmm0
pxor %xmm5,%xmm1
@@ -560,14 +560,14 @@
L$odd_tail:
movdqu (%rdx),%xmm8
-.byte 102,69,15,56,0,194
+ pshufb %xmm10,%xmm8
pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,223,0
+ pclmulqdq $0x00,%xmm2,%xmm0
+ pclmulqdq $0x11,%xmm2,%xmm1
+ pclmulqdq $0x00,%xmm7,%xmm3
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -600,7 +600,7 @@
psrlq $1,%xmm0
pxor %xmm1,%xmm0
L$done:
-.byte 102,65,15,56,0,194
+ pshufb %xmm10,%xmm0
movdqu %xmm0,(%rdi)
ret
diff --git a/gen/bcm/ghash-x86_64-linux.S b/gen/bcm/ghash-x86_64-linux.S
index e00bb9f..f1ffcb8 100644
--- a/gen/bcm/ghash-x86_64-linux.S
+++ b/gen/bcm/ghash-x86_64-linux.S
@@ -38,9 +38,9 @@
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,222,0
+ pclmulqdq $0x00,%xmm2,%xmm0
+ pclmulqdq $0x11,%xmm2,%xmm1
+ pclmulqdq $0x00,%xmm6,%xmm3
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -78,14 +78,14 @@
movdqu %xmm2,0(%rdi)
pxor %xmm0,%xmm4
movdqu %xmm0,16(%rdi)
-.byte 102,15,58,15,227,8
+ palignr $8,%xmm3,%xmm4
movdqu %xmm4,32(%rdi)
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,222,0
+ pclmulqdq $0x00,%xmm2,%xmm0
+ pclmulqdq $0x11,%xmm2,%xmm1
+ pclmulqdq $0x00,%xmm6,%xmm3
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -121,9 +121,9 @@
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,222,0
+ pclmulqdq $0x00,%xmm2,%xmm0
+ pclmulqdq $0x11,%xmm2,%xmm1
+ pclmulqdq $0x00,%xmm6,%xmm3
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -161,7 +161,7 @@
movdqu %xmm5,48(%rdi)
pxor %xmm0,%xmm4
movdqu %xmm0,64(%rdi)
-.byte 102,15,58,15,227,8
+ palignr $8,%xmm3,%xmm4
movdqu %xmm4,80(%rdi)
ret
.cfi_endproc
@@ -179,13 +179,13 @@
movdqa .Lbswap_mask(%rip),%xmm5
movdqu (%rsi),%xmm2
movdqu 32(%rsi),%xmm4
-.byte 102,15,56,0,197
+ pshufb %xmm5,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+ pclmulqdq $0x00,%xmm2,%xmm0
+ pclmulqdq $0x11,%xmm2,%xmm1
+ pclmulqdq $0x00,%xmm4,%xmm3
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -217,7 +217,7 @@
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
-.byte 102,15,56,0,197
+ pshufb %xmm5,%xmm0
movdqu %xmm0,(%rdi)
ret
.cfi_endproc
@@ -236,7 +236,7 @@
movdqu (%rdi),%xmm0
movdqu (%rsi),%xmm2
movdqu 32(%rsi),%xmm7
-.byte 102,65,15,56,0,194
+ pshufb %xmm10,%xmm0
subq $0x10,%rcx
jz .Lodd_tail
@@ -255,21 +255,21 @@
movdqu 48(%rdx),%xmm3
movdqu 32(%rdx),%xmm11
-.byte 102,65,15,56,0,218
-.byte 102,69,15,56,0,218
+ pshufb %xmm10,%xmm3
+ pshufb %xmm10,%xmm11
movdqa %xmm3,%xmm5
pshufd $78,%xmm3,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,68,218,0
-.byte 102,15,58,68,234,17
-.byte 102,15,58,68,231,0
+ pclmulqdq $0x00,%xmm2,%xmm3
+ pclmulqdq $0x11,%xmm2,%xmm5
+ pclmulqdq $0x00,%xmm7,%xmm4
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm11,%xmm12
-.byte 102,68,15,58,68,222,0
-.byte 102,68,15,58,68,238,17
-.byte 102,68,15,58,68,231,16
+ pclmulqdq $0x00,%xmm6,%xmm11
+ pclmulqdq $0x11,%xmm6,%xmm13
+ pclmulqdq $0x10,%xmm7,%xmm12
xorps %xmm11,%xmm3
xorps %xmm13,%xmm5
movups 80(%rsi),%xmm7
@@ -277,18 +277,18 @@
movdqu 16(%rdx),%xmm11
movdqu 0(%rdx),%xmm8
-.byte 102,69,15,56,0,218
-.byte 102,69,15,56,0,194
+ pshufb %xmm10,%xmm11
+ pshufb %xmm10,%xmm8
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,222,0
+ pclmulqdq $0x00,%xmm14,%xmm11
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm8
pxor %xmm0,%xmm8
-.byte 102,69,15,58,68,238,17
-.byte 102,68,15,58,68,231,0
+ pclmulqdq $0x11,%xmm14,%xmm13
+ pclmulqdq $0x00,%xmm7,%xmm12
xorps %xmm11,%xmm3
xorps %xmm13,%xmm5
@@ -299,22 +299,22 @@
jmp .Lmod4_loop
.align 32
.Lmod4_loop:
-.byte 102,65,15,58,68,199,0
+ pclmulqdq $0x00,%xmm15,%xmm0
xorps %xmm12,%xmm4
movdqu 48(%rdx),%xmm11
-.byte 102,69,15,56,0,218
-.byte 102,65,15,58,68,207,17
+ pshufb %xmm10,%xmm11
+ pclmulqdq $0x11,%xmm15,%xmm1
xorps %xmm3,%xmm0
movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
-.byte 102,68,15,58,68,199,16
+ pclmulqdq $0x10,%xmm7,%xmm8
pshufd $78,%xmm11,%xmm12
xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
-.byte 102,65,15,56,0,218
+ pshufb %xmm10,%xmm3
movups 32(%rsi),%xmm7
xorps %xmm4,%xmm8
-.byte 102,68,15,58,68,218,0
+ pclmulqdq $0x00,%xmm2,%xmm11
pshufd $78,%xmm3,%xmm4
pxor %xmm0,%xmm8
@@ -322,22 +322,22 @@
pxor %xmm1,%xmm8
pxor %xmm3,%xmm4
movdqa %xmm8,%xmm9
-.byte 102,68,15,58,68,234,17
+ pclmulqdq $0x11,%xmm2,%xmm13
pslldq $8,%xmm8
psrldq $8,%xmm9
pxor %xmm8,%xmm0
movdqa .L7_mask(%rip),%xmm8
pxor %xmm9,%xmm1
-.byte 102,76,15,110,200
+ movq %rax,%xmm9
pand %xmm0,%xmm8
-.byte 102,69,15,56,0,200
+ pshufb %xmm8,%xmm9
pxor %xmm0,%xmm9
-.byte 102,68,15,58,68,231,0
+ pclmulqdq $0x00,%xmm7,%xmm12
psllq $57,%xmm9
movdqa %xmm9,%xmm8
pslldq $8,%xmm9
-.byte 102,15,58,68,222,0
+ pclmulqdq $0x00,%xmm6,%xmm3
psrldq $8,%xmm8
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
@@ -345,14 +345,14 @@
movdqa %xmm0,%xmm9
psrlq $1,%xmm0
-.byte 102,15,58,68,238,17
+ pclmulqdq $0x11,%xmm6,%xmm5
xorps %xmm11,%xmm3
movdqu 16(%rdx),%xmm11
-.byte 102,69,15,56,0,218
-.byte 102,15,58,68,231,16
+ pshufb %xmm10,%xmm11
+ pclmulqdq $0x10,%xmm7,%xmm4
xorps %xmm13,%xmm5
movups 80(%rsi),%xmm7
-.byte 102,69,15,56,0,194
+ pshufb %xmm10,%xmm8
pxor %xmm9,%xmm1
pxor %xmm0,%xmm9
psrlq $5,%xmm0
@@ -363,16 +363,16 @@
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,222,0
+ pclmulqdq $0x00,%xmm14,%xmm11
psrlq $1,%xmm0
pxor %xmm1,%xmm0
movdqa %xmm0,%xmm1
-.byte 102,69,15,58,68,238,17
+ pclmulqdq $0x11,%xmm14,%xmm13
xorps %xmm11,%xmm3
pshufd $78,%xmm0,%xmm8
pxor %xmm0,%xmm8
-.byte 102,68,15,58,68,231,0
+ pclmulqdq $0x00,%xmm7,%xmm12
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
@@ -380,9 +380,9 @@
jnc .Lmod4_loop
.Ltail4x:
-.byte 102,65,15,58,68,199,0
-.byte 102,65,15,58,68,207,17
-.byte 102,68,15,58,68,199,16
+ pclmulqdq $0x00,%xmm15,%xmm0
+ pclmulqdq $0x11,%xmm15,%xmm1
+ pclmulqdq $0x10,%xmm7,%xmm8
xorps %xmm12,%xmm4
xorps %xmm3,%xmm0
xorps %xmm5,%xmm1
@@ -433,16 +433,16 @@
movdqu (%rdx),%xmm8
movdqu 16(%rdx),%xmm3
-.byte 102,69,15,56,0,194
-.byte 102,65,15,56,0,218
+ pshufb %xmm10,%xmm8
+ pshufb %xmm10,%xmm3
pxor %xmm8,%xmm0
movdqa %xmm3,%xmm5
pshufd $78,%xmm3,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,68,218,0
-.byte 102,15,58,68,234,17
-.byte 102,15,58,68,231,0
+ pclmulqdq $0x00,%xmm2,%xmm3
+ pclmulqdq $0x11,%xmm2,%xmm5
+ pclmulqdq $0x00,%xmm7,%xmm4
leaq 32(%rdx),%rdx
nop
@@ -458,21 +458,21 @@
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,15,58,68,198,0
-.byte 102,15,58,68,206,17
-.byte 102,15,58,68,231,16
+ pclmulqdq $0x00,%xmm6,%xmm0
+ pclmulqdq $0x11,%xmm6,%xmm1
+ pclmulqdq $0x10,%xmm7,%xmm4
pxor %xmm3,%xmm0
pxor %xmm5,%xmm1
movdqu (%rdx),%xmm9
pxor %xmm0,%xmm8
-.byte 102,69,15,56,0,202
+ pshufb %xmm10,%xmm9
movdqu 16(%rdx),%xmm3
pxor %xmm1,%xmm8
pxor %xmm9,%xmm1
pxor %xmm8,%xmm4
-.byte 102,65,15,56,0,218
+ pshufb %xmm10,%xmm3
movdqa %xmm4,%xmm8
psrldq $8,%xmm8
pslldq $8,%xmm4
@@ -485,7 +485,7 @@
movdqa %xmm0,%xmm8
psllq $5,%xmm0
pxor %xmm0,%xmm8
-.byte 102,15,58,68,218,0
+ pclmulqdq $0x00,%xmm2,%xmm3
psllq $1,%xmm0
pxor %xmm8,%xmm0
psllq $57,%xmm0
@@ -499,14 +499,14 @@
movdqa %xmm0,%xmm9
psrlq $1,%xmm0
-.byte 102,15,58,68,234,17
+ pclmulqdq $0x11,%xmm2,%xmm5
pxor %xmm9,%xmm1
pxor %xmm0,%xmm9
psrlq $5,%xmm0
pxor %xmm9,%xmm0
leaq 32(%rdx),%rdx
psrlq $1,%xmm0
-.byte 102,15,58,68,231,0
+ pclmulqdq $0x00,%xmm7,%xmm4
pxor %xmm1,%xmm0
subq $0x20,%rcx
@@ -518,9 +518,9 @@
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,15,58,68,198,0
-.byte 102,15,58,68,206,17
-.byte 102,15,58,68,231,16
+ pclmulqdq $0x00,%xmm6,%xmm0
+ pclmulqdq $0x11,%xmm6,%xmm1
+ pclmulqdq $0x10,%xmm7,%xmm4
pxor %xmm3,%xmm0
pxor %xmm5,%xmm1
@@ -560,14 +560,14 @@
.Lodd_tail:
movdqu (%rdx),%xmm8
-.byte 102,69,15,56,0,194
+ pshufb %xmm10,%xmm8
pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,223,0
+ pclmulqdq $0x00,%xmm2,%xmm0
+ pclmulqdq $0x11,%xmm2,%xmm1
+ pclmulqdq $0x00,%xmm7,%xmm3
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -600,7 +600,7 @@
psrlq $1,%xmm0
pxor %xmm1,%xmm0
.Ldone:
-.byte 102,65,15,56,0,194
+ pshufb %xmm10,%xmm0
movdqu %xmm0,(%rdi)
ret
.cfi_endproc
diff --git a/gen/bcm/ghash-x86_64-win.asm b/gen/bcm/ghash-x86_64-win.asm
index b5416b3..5cfb844 100644
--- a/gen/bcm/ghash-x86_64-win.asm
+++ b/gen/bcm/ghash-x86_64-win.asm
@@ -50,9 +50,9 @@
movdqa xmm1,xmm0
pshufd xmm3,xmm0,78
pxor xmm3,xmm0
-DB 102,15,58,68,194,0
-DB 102,15,58,68,202,17
-DB 102,15,58,68,222,0
+ pclmulqdq xmm0,xmm2,0x00
+ pclmulqdq xmm1,xmm2,0x11
+ pclmulqdq xmm3,xmm6,0x00
pxor xmm3,xmm0
pxor xmm3,xmm1
@@ -90,14 +90,14 @@
movdqu XMMWORD[rcx],xmm2
pxor xmm4,xmm0
movdqu XMMWORD[16+rcx],xmm0
-DB 102,15,58,15,227,8
+ palignr xmm4,xmm3,8
movdqu XMMWORD[32+rcx],xmm4
movdqa xmm1,xmm0
pshufd xmm3,xmm0,78
pxor xmm3,xmm0
-DB 102,15,58,68,194,0
-DB 102,15,58,68,202,17
-DB 102,15,58,68,222,0
+ pclmulqdq xmm0,xmm2,0x00
+ pclmulqdq xmm1,xmm2,0x11
+ pclmulqdq xmm3,xmm6,0x00
pxor xmm3,xmm0
pxor xmm3,xmm1
@@ -133,9 +133,9 @@
movdqa xmm1,xmm0
pshufd xmm3,xmm0,78
pxor xmm3,xmm0
-DB 102,15,58,68,194,0
-DB 102,15,58,68,202,17
-DB 102,15,58,68,222,0
+ pclmulqdq xmm0,xmm2,0x00
+ pclmulqdq xmm1,xmm2,0x11
+ pclmulqdq xmm3,xmm6,0x00
pxor xmm3,xmm0
pxor xmm3,xmm1
@@ -173,7 +173,7 @@
movdqu XMMWORD[48+rcx],xmm5
pxor xmm4,xmm0
movdqu XMMWORD[64+rcx],xmm0
-DB 102,15,58,15,227,8
+ palignr xmm4,xmm3,8
movdqu XMMWORD[80+rcx],xmm4
movaps xmm6,XMMWORD[rsp]
lea rsp,[24+rsp]
@@ -192,13 +192,13 @@
movdqa xmm5,XMMWORD[$L$bswap_mask]
movdqu xmm2,XMMWORD[rdx]
movdqu xmm4,XMMWORD[32+rdx]
-DB 102,15,56,0,197
+ pshufb xmm0,xmm5
movdqa xmm1,xmm0
pshufd xmm3,xmm0,78
pxor xmm3,xmm0
-DB 102,15,58,68,194,0
-DB 102,15,58,68,202,17
-DB 102,15,58,68,220,0
+ pclmulqdq xmm0,xmm2,0x00
+ pclmulqdq xmm1,xmm2,0x11
+ pclmulqdq xmm3,xmm4,0x00
pxor xmm3,xmm0
pxor xmm3,xmm1
@@ -230,7 +230,7 @@
pxor xmm0,xmm4
psrlq xmm0,1
pxor xmm0,xmm1
-DB 102,15,56,0,197
+ pshufb xmm0,xmm5
movdqu XMMWORD[rcx],xmm0
ret
@@ -272,7 +272,7 @@
movdqu xmm0,XMMWORD[rcx]
movdqu xmm2,XMMWORD[rdx]
movdqu xmm7,XMMWORD[32+rdx]
-DB 102,65,15,56,0,194
+ pshufb xmm0,xmm10
sub r9,0x10
jz NEAR $L$odd_tail
@@ -291,21 +291,21 @@
movdqu xmm3,XMMWORD[48+r8]
movdqu xmm11,XMMWORD[32+r8]
-DB 102,65,15,56,0,218
-DB 102,69,15,56,0,218
+ pshufb xmm3,xmm10
+ pshufb xmm11,xmm10
movdqa xmm5,xmm3
pshufd xmm4,xmm3,78
pxor xmm4,xmm3
-DB 102,15,58,68,218,0
-DB 102,15,58,68,234,17
-DB 102,15,58,68,231,0
+ pclmulqdq xmm3,xmm2,0x00
+ pclmulqdq xmm5,xmm2,0x11
+ pclmulqdq xmm4,xmm7,0x00
movdqa xmm13,xmm11
pshufd xmm12,xmm11,78
pxor xmm12,xmm11
-DB 102,68,15,58,68,222,0
-DB 102,68,15,58,68,238,17
-DB 102,68,15,58,68,231,16
+ pclmulqdq xmm11,xmm6,0x00
+ pclmulqdq xmm13,xmm6,0x11
+ pclmulqdq xmm12,xmm7,0x10
xorps xmm3,xmm11
xorps xmm5,xmm13
movups xmm7,XMMWORD[80+rdx]
@@ -313,18 +313,18 @@
movdqu xmm11,XMMWORD[16+r8]
movdqu xmm8,XMMWORD[r8]
-DB 102,69,15,56,0,218
-DB 102,69,15,56,0,194
+ pshufb xmm11,xmm10
+ pshufb xmm8,xmm10
movdqa xmm13,xmm11
pshufd xmm12,xmm11,78
pxor xmm0,xmm8
pxor xmm12,xmm11
-DB 102,69,15,58,68,222,0
+ pclmulqdq xmm11,xmm14,0x00
movdqa xmm1,xmm0
pshufd xmm8,xmm0,78
pxor xmm8,xmm0
-DB 102,69,15,58,68,238,17
-DB 102,68,15,58,68,231,0
+ pclmulqdq xmm13,xmm14,0x11
+ pclmulqdq xmm12,xmm7,0x00
xorps xmm3,xmm11
xorps xmm5,xmm13
@@ -335,22 +335,22 @@
jmp NEAR $L$mod4_loop
ALIGN 32
$L$mod4_loop:
-DB 102,65,15,58,68,199,0
+ pclmulqdq xmm0,xmm15,0x00
xorps xmm4,xmm12
movdqu xmm11,XMMWORD[48+r8]
-DB 102,69,15,56,0,218
-DB 102,65,15,58,68,207,17
+ pshufb xmm11,xmm10
+ pclmulqdq xmm1,xmm15,0x11
xorps xmm0,xmm3
movdqu xmm3,XMMWORD[32+r8]
movdqa xmm13,xmm11
-DB 102,68,15,58,68,199,16
+ pclmulqdq xmm8,xmm7,0x10
pshufd xmm12,xmm11,78
xorps xmm1,xmm5
pxor xmm12,xmm11
-DB 102,65,15,56,0,218
+ pshufb xmm3,xmm10
movups xmm7,XMMWORD[32+rdx]
xorps xmm8,xmm4
-DB 102,68,15,58,68,218,0
+ pclmulqdq xmm11,xmm2,0x00
pshufd xmm4,xmm3,78
pxor xmm8,xmm0
@@ -358,22 +358,22 @@
pxor xmm8,xmm1
pxor xmm4,xmm3
movdqa xmm9,xmm8
-DB 102,68,15,58,68,234,17
+ pclmulqdq xmm13,xmm2,0x11
pslldq xmm8,8
psrldq xmm9,8
pxor xmm0,xmm8
movdqa xmm8,XMMWORD[$L$7_mask]
pxor xmm1,xmm9
-DB 102,76,15,110,200
+ movq xmm9,rax
pand xmm8,xmm0
-DB 102,69,15,56,0,200
+ pshufb xmm9,xmm8
pxor xmm9,xmm0
-DB 102,68,15,58,68,231,0
+ pclmulqdq xmm12,xmm7,0x00
psllq xmm9,57
movdqa xmm8,xmm9
pslldq xmm9,8
-DB 102,15,58,68,222,0
+ pclmulqdq xmm3,xmm6,0x00
psrldq xmm8,8
pxor xmm0,xmm9
pxor xmm1,xmm8
@@ -381,14 +381,14 @@
movdqa xmm9,xmm0
psrlq xmm0,1
-DB 102,15,58,68,238,17
+ pclmulqdq xmm5,xmm6,0x11
xorps xmm3,xmm11
movdqu xmm11,XMMWORD[16+r8]
-DB 102,69,15,56,0,218
-DB 102,15,58,68,231,16
+ pshufb xmm11,xmm10
+ pclmulqdq xmm4,xmm7,0x10
xorps xmm5,xmm13
movups xmm7,XMMWORD[80+rdx]
-DB 102,69,15,56,0,194
+ pshufb xmm8,xmm10
pxor xmm1,xmm9
pxor xmm9,xmm0
psrlq xmm0,5
@@ -399,16 +399,16 @@
pxor xmm0,xmm9
pxor xmm1,xmm8
pxor xmm12,xmm11
-DB 102,69,15,58,68,222,0
+ pclmulqdq xmm11,xmm14,0x00
psrlq xmm0,1
pxor xmm0,xmm1
movdqa xmm1,xmm0
-DB 102,69,15,58,68,238,17
+ pclmulqdq xmm13,xmm14,0x11
xorps xmm3,xmm11
pshufd xmm8,xmm0,78
pxor xmm8,xmm0
-DB 102,68,15,58,68,231,0
+ pclmulqdq xmm12,xmm7,0x00
xorps xmm5,xmm13
lea r8,[64+r8]
@@ -416,9 +416,9 @@
jnc NEAR $L$mod4_loop
$L$tail4x:
-DB 102,65,15,58,68,199,0
-DB 102,65,15,58,68,207,17
-DB 102,68,15,58,68,199,16
+ pclmulqdq xmm0,xmm15,0x00
+ pclmulqdq xmm1,xmm15,0x11
+ pclmulqdq xmm8,xmm7,0x10
xorps xmm4,xmm12
xorps xmm0,xmm3
xorps xmm1,xmm5
@@ -469,16 +469,16 @@
movdqu xmm8,XMMWORD[r8]
movdqu xmm3,XMMWORD[16+r8]
-DB 102,69,15,56,0,194
-DB 102,65,15,56,0,218
+ pshufb xmm8,xmm10
+ pshufb xmm3,xmm10
pxor xmm0,xmm8
movdqa xmm5,xmm3
pshufd xmm4,xmm3,78
pxor xmm4,xmm3
-DB 102,15,58,68,218,0
-DB 102,15,58,68,234,17
-DB 102,15,58,68,231,0
+ pclmulqdq xmm3,xmm2,0x00
+ pclmulqdq xmm5,xmm2,0x11
+ pclmulqdq xmm4,xmm7,0x00
lea r8,[32+r8]
nop
@@ -494,21 +494,21 @@
pshufd xmm4,xmm0,78
pxor xmm4,xmm0
-DB 102,15,58,68,198,0
-DB 102,15,58,68,206,17
-DB 102,15,58,68,231,16
+ pclmulqdq xmm0,xmm6,0x00
+ pclmulqdq xmm1,xmm6,0x11
+ pclmulqdq xmm4,xmm7,0x10
pxor xmm0,xmm3
pxor xmm1,xmm5
movdqu xmm9,XMMWORD[r8]
pxor xmm8,xmm0
-DB 102,69,15,56,0,202
+ pshufb xmm9,xmm10
movdqu xmm3,XMMWORD[16+r8]
pxor xmm8,xmm1
pxor xmm1,xmm9
pxor xmm4,xmm8
-DB 102,65,15,56,0,218
+ pshufb xmm3,xmm10
movdqa xmm8,xmm4
psrldq xmm8,8
pslldq xmm4,8
@@ -521,7 +521,7 @@
movdqa xmm8,xmm0
psllq xmm0,5
pxor xmm8,xmm0
-DB 102,15,58,68,218,0
+ pclmulqdq xmm3,xmm2,0x00
psllq xmm0,1
pxor xmm0,xmm8
psllq xmm0,57
@@ -535,14 +535,14 @@
movdqa xmm9,xmm0
psrlq xmm0,1
-DB 102,15,58,68,234,17
+ pclmulqdq xmm5,xmm2,0x11
pxor xmm1,xmm9
pxor xmm9,xmm0
psrlq xmm0,5
pxor xmm0,xmm9
lea r8,[32+r8]
psrlq xmm0,1
-DB 102,15,58,68,231,0
+ pclmulqdq xmm4,xmm7,0x00
pxor xmm0,xmm1
sub r9,0x20
@@ -554,9 +554,9 @@
pshufd xmm4,xmm0,78
pxor xmm4,xmm0
-DB 102,15,58,68,198,0
-DB 102,15,58,68,206,17
-DB 102,15,58,68,231,16
+ pclmulqdq xmm0,xmm6,0x00
+ pclmulqdq xmm1,xmm6,0x11
+ pclmulqdq xmm4,xmm7,0x10
pxor xmm0,xmm3
pxor xmm1,xmm5
@@ -596,14 +596,14 @@
$L$odd_tail:
movdqu xmm8,XMMWORD[r8]
-DB 102,69,15,56,0,194
+ pshufb xmm8,xmm10
pxor xmm0,xmm8
movdqa xmm1,xmm0
pshufd xmm3,xmm0,78
pxor xmm3,xmm0
-DB 102,15,58,68,194,0
-DB 102,15,58,68,202,17
-DB 102,15,58,68,223,0
+ pclmulqdq xmm0,xmm2,0x00
+ pclmulqdq xmm1,xmm2,0x11
+ pclmulqdq xmm3,xmm7,0x00
pxor xmm3,xmm0
pxor xmm3,xmm1
@@ -636,7 +636,7 @@
psrlq xmm0,1
pxor xmm0,xmm1
$L$done:
-DB 102,65,15,56,0,194
+ pshufb xmm0,xmm10
movdqu XMMWORD[rcx],xmm0
movaps xmm6,XMMWORD[rsp]
movaps xmm7,XMMWORD[16+rsp]
diff --git a/gen/bcm/p256-x86_64-asm-apple.S b/gen/bcm/p256-x86_64-asm-apple.S
index d43fcfc..80ffa01 100644
--- a/gen/bcm/p256-x86_64-asm-apple.S
+++ b/gen/bcm/p256-x86_64-asm-apple.S
@@ -450,21 +450,21 @@
movq %rax,%rbp
mulq %r8
movq %rax,%r9
-.byte 102,72,15,110,205
+ movq %rbp,%xmm1
movq %r14,%rax
movq %rdx,%r10
mulq %r8
addq %rax,%r10
movq %r15,%rax
-.byte 102,73,15,110,214
+ movq %r14,%xmm2
adcq $0,%rdx
movq %rdx,%r11
mulq %r8
addq %rax,%r11
movq %r15,%rax
-.byte 102,73,15,110,223
+ movq %r15,%xmm3
adcq $0,%rdx
movq %rdx,%r12
@@ -503,20 +503,20 @@
mulq %rax
movq %rax,%r8
-.byte 102,72,15,126,200
+ movq %xmm1,%rax
movq %rdx,%rbp
mulq %rax
addq %rbp,%r9
adcq %rax,%r10
-.byte 102,72,15,126,208
+ movq %xmm2,%rax
adcq $0,%rdx
movq %rdx,%rbp
mulq %rax
addq %rbp,%r11
adcq %rax,%r12
-.byte 102,72,15,126,216
+ movq %xmm3,%rax
adcq $0,%rdx
movq %rdx,%rbp
@@ -977,11 +977,11 @@
mulxq %r14,%r9,%r10
mulxq %r15,%rcx,%r11
movq %rdx,%rax
-.byte 102,73,15,110,206
+ movq %r14,%xmm1
mulxq %r8,%rbp,%r12
movq %r14,%rdx
addq %rcx,%r10
-.byte 102,73,15,110,215
+ movq %r15,%xmm2
adcq %rbp,%r11
adcq $0,%r12
xorq %r13,%r13
@@ -998,7 +998,7 @@
mulxq %r8,%rcx,%r14
movq %rax,%rdx
-.byte 102,73,15,110,216
+ movq %r8,%xmm3
xorq %r15,%r15
adcxq %r9,%r9
adoxq %rcx,%r13
@@ -1007,18 +1007,18 @@
mulxq %rdx,%r8,%rbp
-.byte 102,72,15,126,202
+ movq %xmm1,%rdx
adcxq %r11,%r11
adoxq %rbp,%r9
adcxq %r12,%r12
mulxq %rdx,%rcx,%rax
-.byte 102,72,15,126,210
+ movq %xmm2,%rdx
adcxq %r13,%r13
adoxq %rcx,%r10
adcxq %r14,%r14
mulxq %rdx,%rcx,%rbp
.byte 0x67
-.byte 102,72,15,126,218
+ movq %xmm3,%rdx
adoxq %rax,%r11
adcxq %r15,%r15
adoxq %rcx,%r12
@@ -2462,9 +2462,9 @@
movdqa %xmm1,96+16(%rsp)
leaq 32(%rdi),%r10
leaq 64(%rdi),%r11
-.byte 102,72,15,110,199
-.byte 102,73,15,110,202
-.byte 102,73,15,110,211
+ movq %rdi,%xmm0
+ movq %r10,%xmm1
+ movq %r11,%xmm2
leaq 0(%rsp),%rdi
call __ecp_nistz256_mul_by_2q
@@ -2492,7 +2492,7 @@
movq 64+24(%rbx),%r12
leaq 64-0(%rbx),%rsi
leaq 32(%rbx),%rbx
-.byte 102,72,15,126,215
+ movq %xmm2,%rdi
call __ecp_nistz256_mul_montq
call __ecp_nistz256_mul_by_2q
@@ -2517,7 +2517,7 @@
leaq 0+0(%rsp),%rsi
movq 16+0(%rsp),%r15
movq 24+0(%rsp),%r8
-.byte 102,72,15,126,207
+ movq %xmm1,%rdi
call __ecp_nistz256_sqr_montq
xorq %r9,%r9
movq %r12,%rax
@@ -2592,7 +2592,7 @@
leaq 0+32(%rsp),%rsi
movq 16+32(%rsp),%r15
movq 24+32(%rsp),%r8
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
call __ecp_nistz256_sqr_montq
leaq 128(%rsp),%rbx
@@ -2625,8 +2625,8 @@
leaq 0(%rsp),%rdi
call __ecp_nistz256_mul_montq
-.byte 102,72,15,126,203
-.byte 102,72,15,126,207
+ movq %xmm1,%rbx
+ movq %xmm1,%rdi
call __ecp_nistz256_sub_fromq
leaq 160+56(%rsp),%rsi
@@ -2708,7 +2708,7 @@
por %xmm4,%xmm5
pxor %xmm4,%xmm4
por %xmm0,%xmm1
-.byte 102,72,15,110,199
+ movq %rdi,%xmm0
leaq 64-0(%rsi),%rsi
movq %rax,544+0(%rsp)
@@ -2731,7 +2731,7 @@
movq 64+8(%rbx),%r14
movq 64+16(%rbx),%r15
movq 64+24(%rbx),%r8
-.byte 102,72,15,110,203
+ movq %rbx,%xmm1
leaq 64-0(%rbx),%rsi
leaq 32(%rsp),%rdi
@@ -2786,7 +2786,7 @@
orq %r8,%r12
orq %r9,%r12
por %xmm5,%xmm2
-.byte 102,73,15,110,220
+ movq %r12,%xmm3
movq 384(%rsp),%rax
leaq 384(%rsp),%rbx
@@ -2816,8 +2816,8 @@
orq %r8,%r12
orq %r9,%r12
-.byte 102,73,15,126,208
-.byte 102,73,15,126,217
+ movq %xmm2,%r8
+ movq %xmm3,%r9
orq %r8,%r12
.byte 0x3e
jnz L$add_proceedq
@@ -2832,7 +2832,7 @@
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
pxor %xmm0,%xmm0
movdqu %xmm0,0(%rdi)
movdqu %xmm0,16(%rdi)
@@ -2844,8 +2844,8 @@
.p2align 5
L$add_doubleq:
-.byte 102,72,15,126,206
-.byte 102,72,15,126,199
+ movq %xmm1,%rsi
+ movq %xmm0,%rdi
addq $416,%rsp
jmp L$point_double_shortcutq
@@ -2981,7 +2981,7 @@
leaq 320(%rsp),%rdi
call __ecp_nistz256_sub_fromq
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
movdqa %xmm5,%xmm0
movdqa %xmm5,%xmm1
@@ -3128,7 +3128,7 @@
pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,416+16(%rsp)
por %xmm0,%xmm1
-.byte 102,72,15,110,199
+ movq %rdi,%xmm0
movdqa %xmm2,448(%rsp)
movdqa %xmm3,448+16(%rsp)
por %xmm2,%xmm3
@@ -3306,7 +3306,7 @@
leaq 256(%rsp),%rdi
call __ecp_nistz256_sub_fromq
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
movdqa %xmm5,%xmm0
movdqa %xmm5,%xmm1
@@ -3572,9 +3572,9 @@
movdqa %xmm1,96+16(%rsp)
leaq 32(%rdi),%r10
leaq 64(%rdi),%r11
-.byte 102,72,15,110,199
-.byte 102,73,15,110,202
-.byte 102,73,15,110,211
+ movq %rdi,%xmm0
+ movq %r10,%xmm1
+ movq %r11,%xmm2
leaq 0(%rsp),%rdi
call __ecp_nistz256_mul_by_2x
@@ -3602,7 +3602,7 @@
movq 64+24(%rbx),%r12
leaq 64-128(%rbx),%rsi
leaq 32(%rbx),%rbx
-.byte 102,72,15,126,215
+ movq %xmm2,%rdi
call __ecp_nistz256_mul_montx
call __ecp_nistz256_mul_by_2x
@@ -3627,7 +3627,7 @@
leaq -128+0(%rsp),%rsi
movq 16+0(%rsp),%r15
movq 24+0(%rsp),%r8
-.byte 102,72,15,126,207
+ movq %xmm1,%rdi
call __ecp_nistz256_sqr_montx
xorq %r9,%r9
movq %r12,%rax
@@ -3702,7 +3702,7 @@
leaq -128+32(%rsp),%rsi
movq 16+32(%rsp),%r15
movq 24+32(%rsp),%r8
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
call __ecp_nistz256_sqr_montx
leaq 128(%rsp),%rbx
@@ -3735,8 +3735,8 @@
leaq 0(%rsp),%rdi
call __ecp_nistz256_mul_montx
-.byte 102,72,15,126,203
-.byte 102,72,15,126,207
+ movq %xmm1,%rbx
+ movq %xmm1,%rdi
call __ecp_nistz256_sub_fromx
leaq 160+56(%rsp),%rsi
@@ -3818,7 +3818,7 @@
por %xmm4,%xmm5
pxor %xmm4,%xmm4
por %xmm0,%xmm1
-.byte 102,72,15,110,199
+ movq %rdi,%xmm0
leaq 64-128(%rsi),%rsi
movq %rdx,544+0(%rsp)
@@ -3841,7 +3841,7 @@
movq 64+8(%rbx),%r14
movq 64+16(%rbx),%r15
movq 64+24(%rbx),%r8
-.byte 102,72,15,110,203
+ movq %rbx,%xmm1
leaq 64-128(%rbx),%rsi
leaq 32(%rsp),%rdi
@@ -3896,7 +3896,7 @@
orq %r8,%r12
orq %r9,%r12
por %xmm5,%xmm2
-.byte 102,73,15,110,220
+ movq %r12,%xmm3
movq 384(%rsp),%rdx
leaq 384(%rsp),%rbx
@@ -3926,8 +3926,8 @@
orq %r8,%r12
orq %r9,%r12
-.byte 102,73,15,126,208
-.byte 102,73,15,126,217
+ movq %xmm2,%r8
+ movq %xmm3,%r9
orq %r8,%r12
.byte 0x3e
jnz L$add_proceedx
@@ -3942,7 +3942,7 @@
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
pxor %xmm0,%xmm0
movdqu %xmm0,0(%rdi)
movdqu %xmm0,16(%rdi)
@@ -3954,8 +3954,8 @@
.p2align 5
L$add_doublex:
-.byte 102,72,15,126,206
-.byte 102,72,15,126,199
+ movq %xmm1,%rsi
+ movq %xmm0,%rdi
addq $416,%rsp
jmp L$point_double_shortcutx
@@ -4091,7 +4091,7 @@
leaq 320(%rsp),%rdi
call __ecp_nistz256_sub_fromx
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
movdqa %xmm5,%xmm0
movdqa %xmm5,%xmm1
@@ -4238,7 +4238,7 @@
pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,416+16(%rsp)
por %xmm0,%xmm1
-.byte 102,72,15,110,199
+ movq %rdi,%xmm0
movdqa %xmm2,448(%rsp)
movdqa %xmm3,448+16(%rsp)
por %xmm2,%xmm3
@@ -4416,7 +4416,7 @@
leaq 256(%rsp),%rdi
call __ecp_nistz256_sub_fromx
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
movdqa %xmm5,%xmm0
movdqa %xmm5,%xmm1
diff --git a/gen/bcm/p256-x86_64-asm-linux.S b/gen/bcm/p256-x86_64-asm-linux.S
index b14ef80..c54d577 100644
--- a/gen/bcm/p256-x86_64-asm-linux.S
+++ b/gen/bcm/p256-x86_64-asm-linux.S
@@ -464,21 +464,21 @@
movq %rax,%rbp
mulq %r8
movq %rax,%r9
-.byte 102,72,15,110,205
+ movq %rbp,%xmm1
movq %r14,%rax
movq %rdx,%r10
mulq %r8
addq %rax,%r10
movq %r15,%rax
-.byte 102,73,15,110,214
+ movq %r14,%xmm2
adcq $0,%rdx
movq %rdx,%r11
mulq %r8
addq %rax,%r11
movq %r15,%rax
-.byte 102,73,15,110,223
+ movq %r15,%xmm3
adcq $0,%rdx
movq %rdx,%r12
@@ -517,20 +517,20 @@
mulq %rax
movq %rax,%r8
-.byte 102,72,15,126,200
+ movq %xmm1,%rax
movq %rdx,%rbp
mulq %rax
addq %rbp,%r9
adcq %rax,%r10
-.byte 102,72,15,126,208
+ movq %xmm2,%rax
adcq $0,%rdx
movq %rdx,%rbp
mulq %rax
addq %rbp,%r11
adcq %rax,%r12
-.byte 102,72,15,126,216
+ movq %xmm3,%rax
adcq $0,%rdx
movq %rdx,%rbp
@@ -1003,11 +1003,11 @@
mulxq %r14,%r9,%r10
mulxq %r15,%rcx,%r11
movq %rdx,%rax
-.byte 102,73,15,110,206
+ movq %r14,%xmm1
mulxq %r8,%rbp,%r12
movq %r14,%rdx
addq %rcx,%r10
-.byte 102,73,15,110,215
+ movq %r15,%xmm2
adcq %rbp,%r11
adcq $0,%r12
xorq %r13,%r13
@@ -1024,7 +1024,7 @@
mulxq %r8,%rcx,%r14
movq %rax,%rdx
-.byte 102,73,15,110,216
+ movq %r8,%xmm3
xorq %r15,%r15
adcxq %r9,%r9
adoxq %rcx,%r13
@@ -1033,18 +1033,18 @@
mulxq %rdx,%r8,%rbp
-.byte 102,72,15,126,202
+ movq %xmm1,%rdx
adcxq %r11,%r11
adoxq %rbp,%r9
adcxq %r12,%r12
mulxq %rdx,%rcx,%rax
-.byte 102,72,15,126,210
+ movq %xmm2,%rdx
adcxq %r13,%r13
adoxq %rcx,%r10
adcxq %r14,%r14
mulxq %rdx,%rcx,%rbp
.byte 0x67
-.byte 102,72,15,126,218
+ movq %xmm3,%rdx
adoxq %rax,%r11
adcxq %r15,%r15
adoxq %rcx,%r12
@@ -2518,9 +2518,9 @@
movdqa %xmm1,96+16(%rsp)
leaq 32(%rdi),%r10
leaq 64(%rdi),%r11
-.byte 102,72,15,110,199
-.byte 102,73,15,110,202
-.byte 102,73,15,110,211
+ movq %rdi,%xmm0
+ movq %r10,%xmm1
+ movq %r11,%xmm2
leaq 0(%rsp),%rdi
call __ecp_nistz256_mul_by_2q
@@ -2548,7 +2548,7 @@
movq 64+24(%rbx),%r12
leaq 64-0(%rbx),%rsi
leaq 32(%rbx),%rbx
-.byte 102,72,15,126,215
+ movq %xmm2,%rdi
call __ecp_nistz256_mul_montq
call __ecp_nistz256_mul_by_2q
@@ -2573,7 +2573,7 @@
leaq 0+0(%rsp),%rsi
movq 16+0(%rsp),%r15
movq 24+0(%rsp),%r8
-.byte 102,72,15,126,207
+ movq %xmm1,%rdi
call __ecp_nistz256_sqr_montq
xorq %r9,%r9
movq %r12,%rax
@@ -2648,7 +2648,7 @@
leaq 0+32(%rsp),%rsi
movq 16+32(%rsp),%r15
movq 24+32(%rsp),%r8
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
call __ecp_nistz256_sqr_montq
leaq 128(%rsp),%rbx
@@ -2681,8 +2681,8 @@
leaq 0(%rsp),%rdi
call __ecp_nistz256_mul_montq
-.byte 102,72,15,126,203
-.byte 102,72,15,126,207
+ movq %xmm1,%rbx
+ movq %xmm1,%rdi
call __ecp_nistz256_sub_fromq
leaq 160+56(%rsp),%rsi
@@ -2770,7 +2770,7 @@
por %xmm4,%xmm5
pxor %xmm4,%xmm4
por %xmm0,%xmm1
-.byte 102,72,15,110,199
+ movq %rdi,%xmm0
leaq 64-0(%rsi),%rsi
movq %rax,544+0(%rsp)
@@ -2793,7 +2793,7 @@
movq 64+8(%rbx),%r14
movq 64+16(%rbx),%r15
movq 64+24(%rbx),%r8
-.byte 102,72,15,110,203
+ movq %rbx,%xmm1
leaq 64-0(%rbx),%rsi
leaq 32(%rsp),%rdi
@@ -2848,7 +2848,7 @@
orq %r8,%r12
orq %r9,%r12
por %xmm5,%xmm2
-.byte 102,73,15,110,220
+ movq %r12,%xmm3
movq 384(%rsp),%rax
leaq 384(%rsp),%rbx
@@ -2878,8 +2878,8 @@
orq %r8,%r12
orq %r9,%r12
-.byte 102,73,15,126,208
-.byte 102,73,15,126,217
+ movq %xmm2,%r8
+ movq %xmm3,%r9
orq %r8,%r12
.byte 0x3e
jnz .Ladd_proceedq
@@ -2894,7 +2894,7 @@
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
pxor %xmm0,%xmm0
movdqu %xmm0,0(%rdi)
movdqu %xmm0,16(%rdi)
@@ -2906,8 +2906,8 @@
.align 32
.Ladd_doubleq:
-.byte 102,72,15,126,206
-.byte 102,72,15,126,199
+ movq %xmm1,%rsi
+ movq %xmm0,%rdi
addq $416,%rsp
.cfi_adjust_cfa_offset -416
jmp .Lpoint_double_shortcutq
@@ -3043,7 +3043,7 @@
leaq 320(%rsp),%rdi
call __ecp_nistz256_sub_fromq
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
movdqa %xmm5,%xmm0
movdqa %xmm5,%xmm1
@@ -3196,7 +3196,7 @@
pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,416+16(%rsp)
por %xmm0,%xmm1
-.byte 102,72,15,110,199
+ movq %rdi,%xmm0
movdqa %xmm2,448(%rsp)
movdqa %xmm3,448+16(%rsp)
por %xmm2,%xmm3
@@ -3374,7 +3374,7 @@
leaq 256(%rsp),%rdi
call __ecp_nistz256_sub_fromq
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
movdqa %xmm5,%xmm0
movdqa %xmm5,%xmm1
@@ -3646,9 +3646,9 @@
movdqa %xmm1,96+16(%rsp)
leaq 32(%rdi),%r10
leaq 64(%rdi),%r11
-.byte 102,72,15,110,199
-.byte 102,73,15,110,202
-.byte 102,73,15,110,211
+ movq %rdi,%xmm0
+ movq %r10,%xmm1
+ movq %r11,%xmm2
leaq 0(%rsp),%rdi
call __ecp_nistz256_mul_by_2x
@@ -3676,7 +3676,7 @@
movq 64+24(%rbx),%r12
leaq 64-128(%rbx),%rsi
leaq 32(%rbx),%rbx
-.byte 102,72,15,126,215
+ movq %xmm2,%rdi
call __ecp_nistz256_mul_montx
call __ecp_nistz256_mul_by_2x
@@ -3701,7 +3701,7 @@
leaq -128+0(%rsp),%rsi
movq 16+0(%rsp),%r15
movq 24+0(%rsp),%r8
-.byte 102,72,15,126,207
+ movq %xmm1,%rdi
call __ecp_nistz256_sqr_montx
xorq %r9,%r9
movq %r12,%rax
@@ -3776,7 +3776,7 @@
leaq -128+32(%rsp),%rsi
movq 16+32(%rsp),%r15
movq 24+32(%rsp),%r8
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
call __ecp_nistz256_sqr_montx
leaq 128(%rsp),%rbx
@@ -3809,8 +3809,8 @@
leaq 0(%rsp),%rdi
call __ecp_nistz256_mul_montx
-.byte 102,72,15,126,203
-.byte 102,72,15,126,207
+ movq %xmm1,%rbx
+ movq %xmm1,%rdi
call __ecp_nistz256_sub_fromx
leaq 160+56(%rsp),%rsi
@@ -3898,7 +3898,7 @@
por %xmm4,%xmm5
pxor %xmm4,%xmm4
por %xmm0,%xmm1
-.byte 102,72,15,110,199
+ movq %rdi,%xmm0
leaq 64-128(%rsi),%rsi
movq %rdx,544+0(%rsp)
@@ -3921,7 +3921,7 @@
movq 64+8(%rbx),%r14
movq 64+16(%rbx),%r15
movq 64+24(%rbx),%r8
-.byte 102,72,15,110,203
+ movq %rbx,%xmm1
leaq 64-128(%rbx),%rsi
leaq 32(%rsp),%rdi
@@ -3976,7 +3976,7 @@
orq %r8,%r12
orq %r9,%r12
por %xmm5,%xmm2
-.byte 102,73,15,110,220
+ movq %r12,%xmm3
movq 384(%rsp),%rdx
leaq 384(%rsp),%rbx
@@ -4006,8 +4006,8 @@
orq %r8,%r12
orq %r9,%r12
-.byte 102,73,15,126,208
-.byte 102,73,15,126,217
+ movq %xmm2,%r8
+ movq %xmm3,%r9
orq %r8,%r12
.byte 0x3e
jnz .Ladd_proceedx
@@ -4022,7 +4022,7 @@
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
pxor %xmm0,%xmm0
movdqu %xmm0,0(%rdi)
movdqu %xmm0,16(%rdi)
@@ -4034,8 +4034,8 @@
.align 32
.Ladd_doublex:
-.byte 102,72,15,126,206
-.byte 102,72,15,126,199
+ movq %xmm1,%rsi
+ movq %xmm0,%rdi
addq $416,%rsp
.cfi_adjust_cfa_offset -416
jmp .Lpoint_double_shortcutx
@@ -4171,7 +4171,7 @@
leaq 320(%rsp),%rdi
call __ecp_nistz256_sub_fromx
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
movdqa %xmm5,%xmm0
movdqa %xmm5,%xmm1
@@ -4324,7 +4324,7 @@
pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,416+16(%rsp)
por %xmm0,%xmm1
-.byte 102,72,15,110,199
+ movq %rdi,%xmm0
movdqa %xmm2,448(%rsp)
movdqa %xmm3,448+16(%rsp)
por %xmm2,%xmm3
@@ -4502,7 +4502,7 @@
leaq 256(%rsp),%rdi
call __ecp_nistz256_sub_fromx
-.byte 102,72,15,126,199
+ movq %xmm0,%rdi
movdqa %xmm5,%xmm0
movdqa %xmm5,%xmm1
diff --git a/gen/bcm/p256-x86_64-asm-win.asm b/gen/bcm/p256-x86_64-asm-win.asm
index 10a1c0d..194df1c 100644
--- a/gen/bcm/p256-x86_64-asm-win.asm
+++ b/gen/bcm/p256-x86_64-asm-win.asm
@@ -486,21 +486,21 @@
mov rbp,rax
mul r8
mov r9,rax
-DB 102,72,15,110,205
+ movq xmm1,rbp
mov rax,r14
mov r10,rdx
mul r8
add r10,rax
mov rax,r15
-DB 102,73,15,110,214
+ movq xmm2,r14
adc rdx,0
mov r11,rdx
mul r8
add r11,rax
mov rax,r15
-DB 102,73,15,110,223
+ movq xmm3,r15
adc rdx,0
mov r12,rdx
@@ -539,20 +539,20 @@
mul rax
mov r8,rax
-DB 102,72,15,126,200
+ movq rax,xmm1
mov rbp,rdx
mul rax
add r9,rbp
adc r10,rax
-DB 102,72,15,126,208
+ movq rax,xmm2
adc rdx,0
mov rbp,rdx
mul rax
add r11,rbp
adc r12,rax
-DB 102,72,15,126,216
+ movq rax,xmm3
adc rdx,0
mov rbp,rdx
@@ -1033,11 +1033,11 @@
mulx r10,r9,r14
mulx r11,rcx,r15
mov rax,rdx
-DB 102,73,15,110,206
+ movq xmm1,r14
mulx r12,rbp,r8
mov rdx,r14
add r10,rcx
-DB 102,73,15,110,215
+ movq xmm2,r15
adc r11,rbp
adc r12,0
xor r13,r13
@@ -1054,7 +1054,7 @@
mulx r14,rcx,r8
mov rdx,rax
-DB 102,73,15,110,216
+ movq xmm3,r8
xor r15,r15
adcx r9,r9
adox r13,rcx
@@ -1063,18 +1063,18 @@
mulx rbp,r8,rdx
-DB 102,72,15,126,202
+ movq rdx,xmm1
adcx r11,r11
adox r9,rbp
adcx r12,r12
mulx rax,rcx,rdx
-DB 102,72,15,126,210
+ movq rdx,xmm2
adcx r13,r13
adox r10,rcx
adcx r14,r14
mulx rbp,rcx,rdx
DB 0x67
-DB 102,72,15,126,218
+ movq rdx,xmm3
adox r11,rax
adcx r15,r15
adox r12,rcx
@@ -2659,9 +2659,9 @@
movdqa XMMWORD[(96+16)+rsp],xmm1
lea r10,[32+rdi]
lea r11,[64+rdi]
-DB 102,72,15,110,199
-DB 102,73,15,110,202
-DB 102,73,15,110,211
+ movq xmm0,rdi
+ movq xmm1,r10
+ movq xmm2,r11
lea rdi,[rsp]
call __ecp_nistz256_mul_by_2q
@@ -2689,7 +2689,7 @@
mov r12,QWORD[((64+24))+rbx]
lea rsi,[((64-0))+rbx]
lea rbx,[32+rbx]
-DB 102,72,15,126,215
+ movq rdi,xmm2
call __ecp_nistz256_mul_montq
call __ecp_nistz256_mul_by_2q
@@ -2714,7 +2714,7 @@
lea rsi,[((0+0))+rsp]
mov r15,QWORD[((16+0))+rsp]
mov r8,QWORD[((24+0))+rsp]
-DB 102,72,15,126,207
+ movq rdi,xmm1
call __ecp_nistz256_sqr_montq
xor r9,r9
mov rax,r12
@@ -2789,7 +2789,7 @@
lea rsi,[((0+32))+rsp]
mov r15,QWORD[((16+32))+rsp]
mov r8,QWORD[((24+32))+rsp]
-DB 102,72,15,126,199
+ movq rdi,xmm0
call __ecp_nistz256_sqr_montq
lea rbx,[128+rsp]
@@ -2822,8 +2822,8 @@
lea rdi,[rsp]
call __ecp_nistz256_mul_montq
-DB 102,72,15,126,203
-DB 102,72,15,126,207
+ movq rbx,xmm1
+ movq rdi,xmm1
call __ecp_nistz256_sub_fromq
lea rsi,[((160+56))+rsp]
@@ -2915,7 +2915,7 @@
por xmm5,xmm4
pxor xmm4,xmm4
por xmm1,xmm0
-DB 102,72,15,110,199
+ movq xmm0,rdi
lea rsi,[((64-0))+rsi]
mov QWORD[((544+0))+rsp],rax
@@ -2938,7 +2938,7 @@
mov r14,QWORD[((64+8))+rbx]
mov r15,QWORD[((64+16))+rbx]
mov r8,QWORD[((64+24))+rbx]
-DB 102,72,15,110,203
+ movq xmm1,rbx
lea rsi,[((64-0))+rbx]
lea rdi,[32+rsp]
@@ -2993,7 +2993,7 @@
or r12,r8
or r12,r9
por xmm2,xmm5
-DB 102,73,15,110,220
+ movq xmm3,r12
mov rax,QWORD[384+rsp]
lea rbx,[384+rsp]
@@ -3023,8 +3023,8 @@
or r12,r8
or r12,r9
-DB 102,73,15,126,208
-DB 102,73,15,126,217
+ movq r8,xmm2
+ movq r9,xmm3
or r12,r8
DB 0x3e
jnz NEAR $L$add_proceedq
@@ -3039,7 +3039,7 @@
-DB 102,72,15,126,199
+ movq rdi,xmm0
pxor xmm0,xmm0
movdqu XMMWORD[rdi],xmm0
movdqu XMMWORD[16+rdi],xmm0
@@ -3051,8 +3051,8 @@
ALIGN 32
$L$add_doubleq:
-DB 102,72,15,126,206
-DB 102,72,15,126,199
+ movq rsi,xmm1
+ movq rdi,xmm0
add rsp,416
jmp NEAR $L$point_double_shortcutq
@@ -3188,7 +3188,7 @@
lea rdi,[320+rsp]
call __ecp_nistz256_sub_fromq
-DB 102,72,15,126,199
+ movq rdi,xmm0
movdqa xmm0,xmm5
movdqa xmm1,xmm5
@@ -3345,7 +3345,7 @@
pshufd xmm4,xmm5,0x1e
movdqa XMMWORD[(416+16)+rsp],xmm1
por xmm1,xmm0
-DB 102,72,15,110,199
+ movq xmm0,rdi
movdqa XMMWORD[448+rsp],xmm2
movdqa XMMWORD[(448+16)+rsp],xmm3
por xmm3,xmm2
@@ -3523,7 +3523,7 @@
lea rdi,[256+rsp]
call __ecp_nistz256_sub_fromq
-DB 102,72,15,126,199
+ movq rdi,xmm0
movdqa xmm0,xmm5
movdqa xmm1,xmm5
@@ -3798,9 +3798,9 @@
movdqa XMMWORD[(96+16)+rsp],xmm1
lea r10,[32+rdi]
lea r11,[64+rdi]
-DB 102,72,15,110,199
-DB 102,73,15,110,202
-DB 102,73,15,110,211
+ movq xmm0,rdi
+ movq xmm1,r10
+ movq xmm2,r11
lea rdi,[rsp]
call __ecp_nistz256_mul_by_2x
@@ -3828,7 +3828,7 @@
mov r12,QWORD[((64+24))+rbx]
lea rsi,[((64-128))+rbx]
lea rbx,[32+rbx]
-DB 102,72,15,126,215
+ movq rdi,xmm2
call __ecp_nistz256_mul_montx
call __ecp_nistz256_mul_by_2x
@@ -3853,7 +3853,7 @@
lea rsi,[((-128+0))+rsp]
mov r15,QWORD[((16+0))+rsp]
mov r8,QWORD[((24+0))+rsp]
-DB 102,72,15,126,207
+ movq rdi,xmm1
call __ecp_nistz256_sqr_montx
xor r9,r9
mov rax,r12
@@ -3928,7 +3928,7 @@
lea rsi,[((-128+32))+rsp]
mov r15,QWORD[((16+32))+rsp]
mov r8,QWORD[((24+32))+rsp]
-DB 102,72,15,126,199
+ movq rdi,xmm0
call __ecp_nistz256_sqr_montx
lea rbx,[128+rsp]
@@ -3961,8 +3961,8 @@
lea rdi,[rsp]
call __ecp_nistz256_mul_montx
-DB 102,72,15,126,203
-DB 102,72,15,126,207
+ movq rbx,xmm1
+ movq rdi,xmm1
call __ecp_nistz256_sub_fromx
lea rsi,[((160+56))+rsp]
@@ -4054,7 +4054,7 @@
por xmm5,xmm4
pxor xmm4,xmm4
por xmm1,xmm0
-DB 102,72,15,110,199
+ movq xmm0,rdi
lea rsi,[((64-128))+rsi]
mov QWORD[((544+0))+rsp],rdx
@@ -4077,7 +4077,7 @@
mov r14,QWORD[((64+8))+rbx]
mov r15,QWORD[((64+16))+rbx]
mov r8,QWORD[((64+24))+rbx]
-DB 102,72,15,110,203
+ movq xmm1,rbx
lea rsi,[((64-128))+rbx]
lea rdi,[32+rsp]
@@ -4132,7 +4132,7 @@
or r12,r8
or r12,r9
por xmm2,xmm5
-DB 102,73,15,110,220
+ movq xmm3,r12
mov rdx,QWORD[384+rsp]
lea rbx,[384+rsp]
@@ -4162,8 +4162,8 @@
or r12,r8
or r12,r9
-DB 102,73,15,126,208
-DB 102,73,15,126,217
+ movq r8,xmm2
+ movq r9,xmm3
or r12,r8
DB 0x3e
jnz NEAR $L$add_proceedx
@@ -4178,7 +4178,7 @@
-DB 102,72,15,126,199
+ movq rdi,xmm0
pxor xmm0,xmm0
movdqu XMMWORD[rdi],xmm0
movdqu XMMWORD[16+rdi],xmm0
@@ -4190,8 +4190,8 @@
ALIGN 32
$L$add_doublex:
-DB 102,72,15,126,206
-DB 102,72,15,126,199
+ movq rsi,xmm1
+ movq rdi,xmm0
add rsp,416
jmp NEAR $L$point_double_shortcutx
@@ -4327,7 +4327,7 @@
lea rdi,[320+rsp]
call __ecp_nistz256_sub_fromx
-DB 102,72,15,126,199
+ movq rdi,xmm0
movdqa xmm0,xmm5
movdqa xmm1,xmm5
@@ -4484,7 +4484,7 @@
pshufd xmm4,xmm5,0x1e
movdqa XMMWORD[(416+16)+rsp],xmm1
por xmm1,xmm0
-DB 102,72,15,110,199
+ movq xmm0,rdi
movdqa XMMWORD[448+rsp],xmm2
movdqa XMMWORD[(448+16)+rsp],xmm3
por xmm3,xmm2
@@ -4662,7 +4662,7 @@
lea rdi,[256+rsp]
call __ecp_nistz256_sub_fromx
-DB 102,72,15,126,199
+ movq rdi,xmm0
movdqa xmm0,xmm5
movdqa xmm1,xmm5
diff --git a/gen/bcm/rdrand-x86_64-apple.S b/gen/bcm/rdrand-x86_64-apple.S
index 5fdf105..4f990d9 100644
--- a/gen/bcm/rdrand-x86_64-apple.S
+++ b/gen/bcm/rdrand-x86_64-apple.S
@@ -17,7 +17,7 @@
_CET_ENDBR
xorq %rax,%rax
-.byte 72,15,199,242
+ rdrand %rdx
adcq %rax,%rax
movq %rdx,0(%rdi)
@@ -40,7 +40,7 @@
jz L$out
movq $8,%rdx
L$loop:
-.byte 72,15,199,241
+ rdrand %rcx
jnc L$err
movq %rcx,0(%rdi)
addq %rdx,%rdi
diff --git a/gen/bcm/rdrand-x86_64-linux.S b/gen/bcm/rdrand-x86_64-linux.S
index fe81dac..52a1eb2 100644
--- a/gen/bcm/rdrand-x86_64-linux.S
+++ b/gen/bcm/rdrand-x86_64-linux.S
@@ -17,7 +17,7 @@
.cfi_startproc
_CET_ENDBR
xorq %rax,%rax
-.byte 72,15,199,242
+ rdrand %rdx
adcq %rax,%rax
movq %rdx,0(%rdi)
@@ -40,7 +40,7 @@
jz .Lout
movq $8,%rdx
.Lloop:
-.byte 72,15,199,241
+ rdrand %rcx
jnc .Lerr
movq %rcx,0(%rdi)
addq %rdx,%rdi
diff --git a/gen/bcm/rdrand-x86_64-win.asm b/gen/bcm/rdrand-x86_64-win.asm
index aae3d76..6dba87b 100644
--- a/gen/bcm/rdrand-x86_64-win.asm
+++ b/gen/bcm/rdrand-x86_64-win.asm
@@ -24,7 +24,7 @@
_CET_ENDBR
xor rax,rax
-DB 73,15,199,240
+ rdrand r8
adc rax,rax
mov QWORD[rcx],r8
@@ -46,7 +46,7 @@
jz NEAR $L$out
mov r8,8
$L$loop:
-DB 73,15,199,241
+ rdrand r9
jnc NEAR $L$err
mov QWORD[rcx],r9
add rcx,r8
diff --git a/gen/bcm/sha1-x86_64-apple.S b/gen/bcm/sha1-x86_64-apple.S
index a1ea1e6..32b3bc7 100644
--- a/gen/bcm/sha1-x86_64-apple.S
+++ b/gen/bcm/sha1-x86_64-apple.S
@@ -1259,12 +1259,12 @@
movdqu 16(%rsi),%xmm5
pshufd $27,%xmm1,%xmm1
movdqu 32(%rsi),%xmm6
-.byte 102,15,56,0,227
+ pshufb %xmm3,%xmm4
movdqu 48(%rsi),%xmm7
-.byte 102,15,56,0,235
-.byte 102,15,56,0,243
+ pshufb %xmm3,%xmm5
+ pshufb %xmm3,%xmm6
movdqa %xmm1,%xmm9
-.byte 102,15,56,0,251
+ pshufb %xmm3,%xmm7
jmp L$oop_shaext
.p2align 4
@@ -1275,133 +1275,133 @@
cmovneq %r8,%rsi
prefetcht0 512(%rsi)
movdqa %xmm0,%xmm8
-.byte 15,56,201,229
+ sha1msg1 %xmm5,%xmm4
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,0
-.byte 15,56,200,213
+ sha1rnds4 $0,%xmm1,%xmm0
+ sha1nexte %xmm5,%xmm2
pxor %xmm6,%xmm4
-.byte 15,56,201,238
-.byte 15,56,202,231
+ sha1msg1 %xmm6,%xmm5
+ sha1msg2 %xmm7,%xmm4
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,0
-.byte 15,56,200,206
+ sha1rnds4 $0,%xmm2,%xmm0
+ sha1nexte %xmm6,%xmm1
pxor %xmm7,%xmm5
-.byte 15,56,202,236
-.byte 15,56,201,247
+ sha1msg2 %xmm4,%xmm5
+ sha1msg1 %xmm7,%xmm6
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,0
-.byte 15,56,200,215
+ sha1rnds4 $0,%xmm1,%xmm0
+ sha1nexte %xmm7,%xmm2
pxor %xmm4,%xmm6
-.byte 15,56,201,252
-.byte 15,56,202,245
+ sha1msg1 %xmm4,%xmm7
+ sha1msg2 %xmm5,%xmm6
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,0
-.byte 15,56,200,204
+ sha1rnds4 $0,%xmm2,%xmm0
+ sha1nexte %xmm4,%xmm1
pxor %xmm5,%xmm7
-.byte 15,56,202,254
-.byte 15,56,201,229
+ sha1msg2 %xmm6,%xmm7
+ sha1msg1 %xmm5,%xmm4
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,0
-.byte 15,56,200,213
+ sha1rnds4 $0,%xmm1,%xmm0
+ sha1nexte %xmm5,%xmm2
pxor %xmm6,%xmm4
-.byte 15,56,201,238
-.byte 15,56,202,231
+ sha1msg1 %xmm6,%xmm5
+ sha1msg2 %xmm7,%xmm4
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,1
-.byte 15,56,200,206
+ sha1rnds4 $1,%xmm2,%xmm0
+ sha1nexte %xmm6,%xmm1
pxor %xmm7,%xmm5
-.byte 15,56,202,236
-.byte 15,56,201,247
+ sha1msg2 %xmm4,%xmm5
+ sha1msg1 %xmm7,%xmm6
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,1
-.byte 15,56,200,215
+ sha1rnds4 $1,%xmm1,%xmm0
+ sha1nexte %xmm7,%xmm2
pxor %xmm4,%xmm6
-.byte 15,56,201,252
-.byte 15,56,202,245
+ sha1msg1 %xmm4,%xmm7
+ sha1msg2 %xmm5,%xmm6
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,1
-.byte 15,56,200,204
+ sha1rnds4 $1,%xmm2,%xmm0
+ sha1nexte %xmm4,%xmm1
pxor %xmm5,%xmm7
-.byte 15,56,202,254
-.byte 15,56,201,229
+ sha1msg2 %xmm6,%xmm7
+ sha1msg1 %xmm5,%xmm4
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,1
-.byte 15,56,200,213
+ sha1rnds4 $1,%xmm1,%xmm0
+ sha1nexte %xmm5,%xmm2
pxor %xmm6,%xmm4
-.byte 15,56,201,238
-.byte 15,56,202,231
+ sha1msg1 %xmm6,%xmm5
+ sha1msg2 %xmm7,%xmm4
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,1
-.byte 15,56,200,206
+ sha1rnds4 $1,%xmm2,%xmm0
+ sha1nexte %xmm6,%xmm1
pxor %xmm7,%xmm5
-.byte 15,56,202,236
-.byte 15,56,201,247
+ sha1msg2 %xmm4,%xmm5
+ sha1msg1 %xmm7,%xmm6
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,2
-.byte 15,56,200,215
+ sha1rnds4 $2,%xmm1,%xmm0
+ sha1nexte %xmm7,%xmm2
pxor %xmm4,%xmm6
-.byte 15,56,201,252
-.byte 15,56,202,245
+ sha1msg1 %xmm4,%xmm7
+ sha1msg2 %xmm5,%xmm6
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,2
-.byte 15,56,200,204
+ sha1rnds4 $2,%xmm2,%xmm0
+ sha1nexte %xmm4,%xmm1
pxor %xmm5,%xmm7
-.byte 15,56,202,254
-.byte 15,56,201,229
+ sha1msg2 %xmm6,%xmm7
+ sha1msg1 %xmm5,%xmm4
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,2
-.byte 15,56,200,213
+ sha1rnds4 $2,%xmm1,%xmm0
+ sha1nexte %xmm5,%xmm2
pxor %xmm6,%xmm4
-.byte 15,56,201,238
-.byte 15,56,202,231
+ sha1msg1 %xmm6,%xmm5
+ sha1msg2 %xmm7,%xmm4
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,2
-.byte 15,56,200,206
+ sha1rnds4 $2,%xmm2,%xmm0
+ sha1nexte %xmm6,%xmm1
pxor %xmm7,%xmm5
-.byte 15,56,202,236
-.byte 15,56,201,247
+ sha1msg2 %xmm4,%xmm5
+ sha1msg1 %xmm7,%xmm6
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,2
-.byte 15,56,200,215
+ sha1rnds4 $2,%xmm1,%xmm0
+ sha1nexte %xmm7,%xmm2
pxor %xmm4,%xmm6
-.byte 15,56,201,252
-.byte 15,56,202,245
+ sha1msg1 %xmm4,%xmm7
+ sha1msg2 %xmm5,%xmm6
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,3
-.byte 15,56,200,204
+ sha1rnds4 $3,%xmm2,%xmm0
+ sha1nexte %xmm4,%xmm1
pxor %xmm5,%xmm7
-.byte 15,56,202,254
+ sha1msg2 %xmm6,%xmm7
movdqu (%rsi),%xmm4
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,3
-.byte 15,56,200,213
+ sha1rnds4 $3,%xmm1,%xmm0
+ sha1nexte %xmm5,%xmm2
movdqu 16(%rsi),%xmm5
-.byte 102,15,56,0,227
+ pshufb %xmm3,%xmm4
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,3
-.byte 15,56,200,206
+ sha1rnds4 $3,%xmm2,%xmm0
+ sha1nexte %xmm6,%xmm1
movdqu 32(%rsi),%xmm6
-.byte 102,15,56,0,235
+ pshufb %xmm3,%xmm5
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,3
-.byte 15,56,200,215
+ sha1rnds4 $3,%xmm1,%xmm0
+ sha1nexte %xmm7,%xmm2
movdqu 48(%rsi),%xmm7
-.byte 102,15,56,0,243
+ pshufb %xmm3,%xmm6
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,3
-.byte 65,15,56,200,201
-.byte 102,15,56,0,251
+ sha1rnds4 $3,%xmm2,%xmm0
+ sha1nexte %xmm9,%xmm1
+ pshufb %xmm3,%xmm7
paddd %xmm8,%xmm0
movdqa %xmm1,%xmm9
@@ -1460,12 +1460,12 @@
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
movdqu 48(%r9),%xmm3
-.byte 102,15,56,0,198
-.byte 102,15,56,0,206
-.byte 102,15,56,0,214
+ pshufb %xmm6,%xmm0
+ pshufb %xmm6,%xmm1
+ pshufb %xmm6,%xmm2
addq $64,%r9
paddd %xmm9,%xmm0
-.byte 102,15,56,0,222
+ pshufb %xmm6,%xmm3
paddd %xmm9,%xmm1
paddd %xmm9,%xmm2
movdqa %xmm0,0(%rsp)
@@ -2357,12 +2357,12 @@
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
movdqu 48(%r9),%xmm3
-.byte 102,15,56,0,198
+ pshufb %xmm6,%xmm0
addq $64,%r9
addl 16(%rsp),%ebx
xorl %ebp,%esi
movl %ecx,%edi
-.byte 102,15,56,0,206
+ pshufb %xmm6,%xmm1
roll $5,%ecx
addl %esi,%ebx
xorl %ebp,%edi
@@ -2398,7 +2398,7 @@
addl 32(%rsp),%ecx
xorl %eax,%esi
movl %edx,%edi
-.byte 102,15,56,0,214
+ pshufb %xmm6,%xmm2
roll $5,%edx
addl %esi,%ecx
xorl %eax,%edi
@@ -2434,7 +2434,7 @@
addl 48(%rsp),%edx
xorl %ebx,%esi
movl %ebp,%edi
-.byte 102,15,56,0,222
+ pshufb %xmm6,%xmm3
roll $5,%ebp
addl %esi,%edx
xorl %ebx,%edi
diff --git a/gen/bcm/sha1-x86_64-linux.S b/gen/bcm/sha1-x86_64-linux.S
index 39d9ad3..1f4807a 100644
--- a/gen/bcm/sha1-x86_64-linux.S
+++ b/gen/bcm/sha1-x86_64-linux.S
@@ -1259,12 +1259,12 @@
movdqu 16(%rsi),%xmm5
pshufd $27,%xmm1,%xmm1
movdqu 32(%rsi),%xmm6
-.byte 102,15,56,0,227
+ pshufb %xmm3,%xmm4
movdqu 48(%rsi),%xmm7
-.byte 102,15,56,0,235
-.byte 102,15,56,0,243
+ pshufb %xmm3,%xmm5
+ pshufb %xmm3,%xmm6
movdqa %xmm1,%xmm9
-.byte 102,15,56,0,251
+ pshufb %xmm3,%xmm7
jmp .Loop_shaext
.align 16
@@ -1275,133 +1275,133 @@
cmovneq %r8,%rsi
prefetcht0 512(%rsi)
movdqa %xmm0,%xmm8
-.byte 15,56,201,229
+ sha1msg1 %xmm5,%xmm4
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,0
-.byte 15,56,200,213
+ sha1rnds4 $0,%xmm1,%xmm0
+ sha1nexte %xmm5,%xmm2
pxor %xmm6,%xmm4
-.byte 15,56,201,238
-.byte 15,56,202,231
+ sha1msg1 %xmm6,%xmm5
+ sha1msg2 %xmm7,%xmm4
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,0
-.byte 15,56,200,206
+ sha1rnds4 $0,%xmm2,%xmm0
+ sha1nexte %xmm6,%xmm1
pxor %xmm7,%xmm5
-.byte 15,56,202,236
-.byte 15,56,201,247
+ sha1msg2 %xmm4,%xmm5
+ sha1msg1 %xmm7,%xmm6
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,0
-.byte 15,56,200,215
+ sha1rnds4 $0,%xmm1,%xmm0
+ sha1nexte %xmm7,%xmm2
pxor %xmm4,%xmm6
-.byte 15,56,201,252
-.byte 15,56,202,245
+ sha1msg1 %xmm4,%xmm7
+ sha1msg2 %xmm5,%xmm6
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,0
-.byte 15,56,200,204
+ sha1rnds4 $0,%xmm2,%xmm0
+ sha1nexte %xmm4,%xmm1
pxor %xmm5,%xmm7
-.byte 15,56,202,254
-.byte 15,56,201,229
+ sha1msg2 %xmm6,%xmm7
+ sha1msg1 %xmm5,%xmm4
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,0
-.byte 15,56,200,213
+ sha1rnds4 $0,%xmm1,%xmm0
+ sha1nexte %xmm5,%xmm2
pxor %xmm6,%xmm4
-.byte 15,56,201,238
-.byte 15,56,202,231
+ sha1msg1 %xmm6,%xmm5
+ sha1msg2 %xmm7,%xmm4
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,1
-.byte 15,56,200,206
+ sha1rnds4 $1,%xmm2,%xmm0
+ sha1nexte %xmm6,%xmm1
pxor %xmm7,%xmm5
-.byte 15,56,202,236
-.byte 15,56,201,247
+ sha1msg2 %xmm4,%xmm5
+ sha1msg1 %xmm7,%xmm6
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,1
-.byte 15,56,200,215
+ sha1rnds4 $1,%xmm1,%xmm0
+ sha1nexte %xmm7,%xmm2
pxor %xmm4,%xmm6
-.byte 15,56,201,252
-.byte 15,56,202,245
+ sha1msg1 %xmm4,%xmm7
+ sha1msg2 %xmm5,%xmm6
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,1
-.byte 15,56,200,204
+ sha1rnds4 $1,%xmm2,%xmm0
+ sha1nexte %xmm4,%xmm1
pxor %xmm5,%xmm7
-.byte 15,56,202,254
-.byte 15,56,201,229
+ sha1msg2 %xmm6,%xmm7
+ sha1msg1 %xmm5,%xmm4
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,1
-.byte 15,56,200,213
+ sha1rnds4 $1,%xmm1,%xmm0
+ sha1nexte %xmm5,%xmm2
pxor %xmm6,%xmm4
-.byte 15,56,201,238
-.byte 15,56,202,231
+ sha1msg1 %xmm6,%xmm5
+ sha1msg2 %xmm7,%xmm4
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,1
-.byte 15,56,200,206
+ sha1rnds4 $1,%xmm2,%xmm0
+ sha1nexte %xmm6,%xmm1
pxor %xmm7,%xmm5
-.byte 15,56,202,236
-.byte 15,56,201,247
+ sha1msg2 %xmm4,%xmm5
+ sha1msg1 %xmm7,%xmm6
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,2
-.byte 15,56,200,215
+ sha1rnds4 $2,%xmm1,%xmm0
+ sha1nexte %xmm7,%xmm2
pxor %xmm4,%xmm6
-.byte 15,56,201,252
-.byte 15,56,202,245
+ sha1msg1 %xmm4,%xmm7
+ sha1msg2 %xmm5,%xmm6
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,2
-.byte 15,56,200,204
+ sha1rnds4 $2,%xmm2,%xmm0
+ sha1nexte %xmm4,%xmm1
pxor %xmm5,%xmm7
-.byte 15,56,202,254
-.byte 15,56,201,229
+ sha1msg2 %xmm6,%xmm7
+ sha1msg1 %xmm5,%xmm4
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,2
-.byte 15,56,200,213
+ sha1rnds4 $2,%xmm1,%xmm0
+ sha1nexte %xmm5,%xmm2
pxor %xmm6,%xmm4
-.byte 15,56,201,238
-.byte 15,56,202,231
+ sha1msg1 %xmm6,%xmm5
+ sha1msg2 %xmm7,%xmm4
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,2
-.byte 15,56,200,206
+ sha1rnds4 $2,%xmm2,%xmm0
+ sha1nexte %xmm6,%xmm1
pxor %xmm7,%xmm5
-.byte 15,56,202,236
-.byte 15,56,201,247
+ sha1msg2 %xmm4,%xmm5
+ sha1msg1 %xmm7,%xmm6
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,2
-.byte 15,56,200,215
+ sha1rnds4 $2,%xmm1,%xmm0
+ sha1nexte %xmm7,%xmm2
pxor %xmm4,%xmm6
-.byte 15,56,201,252
-.byte 15,56,202,245
+ sha1msg1 %xmm4,%xmm7
+ sha1msg2 %xmm5,%xmm6
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,3
-.byte 15,56,200,204
+ sha1rnds4 $3,%xmm2,%xmm0
+ sha1nexte %xmm4,%xmm1
pxor %xmm5,%xmm7
-.byte 15,56,202,254
+ sha1msg2 %xmm6,%xmm7
movdqu (%rsi),%xmm4
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,3
-.byte 15,56,200,213
+ sha1rnds4 $3,%xmm1,%xmm0
+ sha1nexte %xmm5,%xmm2
movdqu 16(%rsi),%xmm5
-.byte 102,15,56,0,227
+ pshufb %xmm3,%xmm4
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,3
-.byte 15,56,200,206
+ sha1rnds4 $3,%xmm2,%xmm0
+ sha1nexte %xmm6,%xmm1
movdqu 32(%rsi),%xmm6
-.byte 102,15,56,0,235
+ pshufb %xmm3,%xmm5
movdqa %xmm0,%xmm2
-.byte 15,58,204,193,3
-.byte 15,56,200,215
+ sha1rnds4 $3,%xmm1,%xmm0
+ sha1nexte %xmm7,%xmm2
movdqu 48(%rsi),%xmm7
-.byte 102,15,56,0,243
+ pshufb %xmm3,%xmm6
movdqa %xmm0,%xmm1
-.byte 15,58,204,194,3
-.byte 65,15,56,200,201
-.byte 102,15,56,0,251
+ sha1rnds4 $3,%xmm2,%xmm0
+ sha1nexte %xmm9,%xmm1
+ pshufb %xmm3,%xmm7
paddd %xmm8,%xmm0
movdqa %xmm1,%xmm9
@@ -1460,12 +1460,12 @@
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
movdqu 48(%r9),%xmm3
-.byte 102,15,56,0,198
-.byte 102,15,56,0,206
-.byte 102,15,56,0,214
+ pshufb %xmm6,%xmm0
+ pshufb %xmm6,%xmm1
+ pshufb %xmm6,%xmm2
addq $64,%r9
paddd %xmm9,%xmm0
-.byte 102,15,56,0,222
+ pshufb %xmm6,%xmm3
paddd %xmm9,%xmm1
paddd %xmm9,%xmm2
movdqa %xmm0,0(%rsp)
@@ -2357,12 +2357,12 @@
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
movdqu 48(%r9),%xmm3
-.byte 102,15,56,0,198
+ pshufb %xmm6,%xmm0
addq $64,%r9
addl 16(%rsp),%ebx
xorl %ebp,%esi
movl %ecx,%edi
-.byte 102,15,56,0,206
+ pshufb %xmm6,%xmm1
roll $5,%ecx
addl %esi,%ebx
xorl %ebp,%edi
@@ -2398,7 +2398,7 @@
addl 32(%rsp),%ecx
xorl %eax,%esi
movl %edx,%edi
-.byte 102,15,56,0,214
+ pshufb %xmm6,%xmm2
roll $5,%edx
addl %esi,%ecx
xorl %eax,%edi
@@ -2434,7 +2434,7 @@
addl 48(%rsp),%edx
xorl %ebx,%esi
movl %ebp,%edi
-.byte 102,15,56,0,222
+ pshufb %xmm6,%xmm3
roll $5,%ebp
addl %esi,%edx
xorl %ebx,%edi
diff --git a/gen/bcm/sha1-x86_64-win.asm b/gen/bcm/sha1-x86_64-win.asm
index 92e9b9c..c5da333 100644
--- a/gen/bcm/sha1-x86_64-win.asm
+++ b/gen/bcm/sha1-x86_64-win.asm
@@ -1291,12 +1291,12 @@
movdqu xmm5,XMMWORD[16+rsi]
pshufd xmm1,xmm1,27
movdqu xmm6,XMMWORD[32+rsi]
-DB 102,15,56,0,227
+ pshufb xmm4,xmm3
movdqu xmm7,XMMWORD[48+rsi]
-DB 102,15,56,0,235
-DB 102,15,56,0,243
+ pshufb xmm5,xmm3
+ pshufb xmm6,xmm3
movdqa xmm9,xmm1
-DB 102,15,56,0,251
+ pshufb xmm7,xmm3
jmp NEAR $L$oop_shaext
ALIGN 16
@@ -1307,133 +1307,133 @@
cmovne rsi,r8
prefetcht0 [512+rsi]
movdqa xmm8,xmm0
- DB 15,56,201,229
+ sha1msg1 xmm4,xmm5
movdqa xmm2,xmm0
- DB 15,58,204,193,0
- DB 15,56,200,213
+ sha1rnds4 xmm0,xmm1,0
+ sha1nexte xmm2,xmm5
pxor xmm4,xmm6
- DB 15,56,201,238
- DB 15,56,202,231
+ sha1msg1 xmm5,xmm6
+ sha1msg2 xmm4,xmm7
movdqa xmm1,xmm0
- DB 15,58,204,194,0
- DB 15,56,200,206
+ sha1rnds4 xmm0,xmm2,0
+ sha1nexte xmm1,xmm6
pxor xmm5,xmm7
- DB 15,56,202,236
- DB 15,56,201,247
+ sha1msg2 xmm5,xmm4
+ sha1msg1 xmm6,xmm7
movdqa xmm2,xmm0
- DB 15,58,204,193,0
- DB 15,56,200,215
+ sha1rnds4 xmm0,xmm1,0
+ sha1nexte xmm2,xmm7
pxor xmm6,xmm4
- DB 15,56,201,252
- DB 15,56,202,245
+ sha1msg1 xmm7,xmm4
+ sha1msg2 xmm6,xmm5
movdqa xmm1,xmm0
- DB 15,58,204,194,0
- DB 15,56,200,204
+ sha1rnds4 xmm0,xmm2,0
+ sha1nexte xmm1,xmm4
pxor xmm7,xmm5
- DB 15,56,202,254
- DB 15,56,201,229
+ sha1msg2 xmm7,xmm6
+ sha1msg1 xmm4,xmm5
movdqa xmm2,xmm0
- DB 15,58,204,193,0
- DB 15,56,200,213
+ sha1rnds4 xmm0,xmm1,0
+ sha1nexte xmm2,xmm5
pxor xmm4,xmm6
- DB 15,56,201,238
- DB 15,56,202,231
+ sha1msg1 xmm5,xmm6
+ sha1msg2 xmm4,xmm7
movdqa xmm1,xmm0
- DB 15,58,204,194,1
- DB 15,56,200,206
+ sha1rnds4 xmm0,xmm2,1
+ sha1nexte xmm1,xmm6
pxor xmm5,xmm7
- DB 15,56,202,236
- DB 15,56,201,247
+ sha1msg2 xmm5,xmm4
+ sha1msg1 xmm6,xmm7
movdqa xmm2,xmm0
- DB 15,58,204,193,1
- DB 15,56,200,215
+ sha1rnds4 xmm0,xmm1,1
+ sha1nexte xmm2,xmm7
pxor xmm6,xmm4
- DB 15,56,201,252
- DB 15,56,202,245
+ sha1msg1 xmm7,xmm4
+ sha1msg2 xmm6,xmm5
movdqa xmm1,xmm0
- DB 15,58,204,194,1
- DB 15,56,200,204
+ sha1rnds4 xmm0,xmm2,1
+ sha1nexte xmm1,xmm4
pxor xmm7,xmm5
- DB 15,56,202,254
- DB 15,56,201,229
+ sha1msg2 xmm7,xmm6
+ sha1msg1 xmm4,xmm5
movdqa xmm2,xmm0
- DB 15,58,204,193,1
- DB 15,56,200,213
+ sha1rnds4 xmm0,xmm1,1
+ sha1nexte xmm2,xmm5
pxor xmm4,xmm6
- DB 15,56,201,238
- DB 15,56,202,231
+ sha1msg1 xmm5,xmm6
+ sha1msg2 xmm4,xmm7
movdqa xmm1,xmm0
- DB 15,58,204,194,1
- DB 15,56,200,206
+ sha1rnds4 xmm0,xmm2,1
+ sha1nexte xmm1,xmm6
pxor xmm5,xmm7
- DB 15,56,202,236
- DB 15,56,201,247
+ sha1msg2 xmm5,xmm4
+ sha1msg1 xmm6,xmm7
movdqa xmm2,xmm0
- DB 15,58,204,193,2
- DB 15,56,200,215
+ sha1rnds4 xmm0,xmm1,2
+ sha1nexte xmm2,xmm7
pxor xmm6,xmm4
- DB 15,56,201,252
- DB 15,56,202,245
+ sha1msg1 xmm7,xmm4
+ sha1msg2 xmm6,xmm5
movdqa xmm1,xmm0
- DB 15,58,204,194,2
- DB 15,56,200,204
+ sha1rnds4 xmm0,xmm2,2
+ sha1nexte xmm1,xmm4
pxor xmm7,xmm5
- DB 15,56,202,254
- DB 15,56,201,229
+ sha1msg2 xmm7,xmm6
+ sha1msg1 xmm4,xmm5
movdqa xmm2,xmm0
- DB 15,58,204,193,2
- DB 15,56,200,213
+ sha1rnds4 xmm0,xmm1,2
+ sha1nexte xmm2,xmm5
pxor xmm4,xmm6
- DB 15,56,201,238
- DB 15,56,202,231
+ sha1msg1 xmm5,xmm6
+ sha1msg2 xmm4,xmm7
movdqa xmm1,xmm0
- DB 15,58,204,194,2
- DB 15,56,200,206
+ sha1rnds4 xmm0,xmm2,2
+ sha1nexte xmm1,xmm6
pxor xmm5,xmm7
- DB 15,56,202,236
- DB 15,56,201,247
+ sha1msg2 xmm5,xmm4
+ sha1msg1 xmm6,xmm7
movdqa xmm2,xmm0
- DB 15,58,204,193,2
- DB 15,56,200,215
+ sha1rnds4 xmm0,xmm1,2
+ sha1nexte xmm2,xmm7
pxor xmm6,xmm4
- DB 15,56,201,252
- DB 15,56,202,245
+ sha1msg1 xmm7,xmm4
+ sha1msg2 xmm6,xmm5
movdqa xmm1,xmm0
- DB 15,58,204,194,3
- DB 15,56,200,204
+ sha1rnds4 xmm0,xmm2,3
+ sha1nexte xmm1,xmm4
pxor xmm7,xmm5
- DB 15,56,202,254
+ sha1msg2 xmm7,xmm6
movdqu xmm4,XMMWORD[rsi]
movdqa xmm2,xmm0
- DB 15,58,204,193,3
- DB 15,56,200,213
+ sha1rnds4 xmm0,xmm1,3
+ sha1nexte xmm2,xmm5
movdqu xmm5,XMMWORD[16+rsi]
-DB 102,15,56,0,227
+ pshufb xmm4,xmm3
movdqa xmm1,xmm0
- DB 15,58,204,194,3
- DB 15,56,200,206
+ sha1rnds4 xmm0,xmm2,3
+ sha1nexte xmm1,xmm6
movdqu xmm6,XMMWORD[32+rsi]
-DB 102,15,56,0,235
+ pshufb xmm5,xmm3
movdqa xmm2,xmm0
- DB 15,58,204,193,3
- DB 15,56,200,215
+ sha1rnds4 xmm0,xmm1,3
+ sha1nexte xmm2,xmm7
movdqu xmm7,XMMWORD[48+rsi]
-DB 102,15,56,0,243
+ pshufb xmm6,xmm3
movdqa xmm1,xmm0
- DB 15,58,204,194,3
- DB 65,15,56,200,201
-DB 102,15,56,0,251
+ sha1rnds4 xmm0,xmm2,3
+ sha1nexte xmm1,xmm9
+ pshufb xmm7,xmm3
paddd xmm0,xmm8
movdqa xmm9,xmm1
@@ -1515,12 +1515,12 @@
movdqu xmm1,XMMWORD[16+r9]
movdqu xmm2,XMMWORD[32+r9]
movdqu xmm3,XMMWORD[48+r9]
-DB 102,15,56,0,198
-DB 102,15,56,0,206
-DB 102,15,56,0,214
+ pshufb xmm0,xmm6
+ pshufb xmm1,xmm6
+ pshufb xmm2,xmm6
add r9,64
paddd xmm0,xmm9
-DB 102,15,56,0,222
+ pshufb xmm3,xmm6
paddd xmm1,xmm9
paddd xmm2,xmm9
movdqa XMMWORD[rsp],xmm0
@@ -2412,12 +2412,12 @@
movdqu xmm1,XMMWORD[16+r9]
movdqu xmm2,XMMWORD[32+r9]
movdqu xmm3,XMMWORD[48+r9]
-DB 102,15,56,0,198
+ pshufb xmm0,xmm6
add r9,64
add ebx,DWORD[16+rsp]
xor esi,ebp
mov edi,ecx
-DB 102,15,56,0,206
+ pshufb xmm1,xmm6
rol ecx,5
add ebx,esi
xor edi,ebp
@@ -2453,7 +2453,7 @@
add ecx,DWORD[32+rsp]
xor esi,eax
mov edi,edx
-DB 102,15,56,0,214
+ pshufb xmm2,xmm6
rol edx,5
add ecx,esi
xor edi,eax
@@ -2489,7 +2489,7 @@
add edx,DWORD[48+rsp]
xor esi,ebx
mov edi,ebp
-DB 102,15,56,0,222
+ pshufb xmm3,xmm6
rol ebp,5
add edx,esi
xor edi,ebx
diff --git a/gen/bcm/sha256-x86_64-apple.S b/gen/bcm/sha256-x86_64-apple.S
index b33f807..367f0d3 100644
--- a/gen/bcm/sha256-x86_64-apple.S
+++ b/gen/bcm/sha256-x86_64-apple.S
@@ -1780,7 +1780,7 @@
pshufd $0xb1,%xmm1,%xmm1
pshufd $0x1b,%xmm2,%xmm2
movdqa %xmm7,%xmm8
-.byte 102,15,58,15,202,8
+ palignr $8,%xmm2,%xmm1
punpcklqdq %xmm0,%xmm2
jmp L$oop_shaext
@@ -1789,176 +1789,176 @@
movdqu (%rsi),%xmm3
movdqu 16(%rsi),%xmm4
movdqu 32(%rsi),%xmm5
-.byte 102,15,56,0,223
+ pshufb %xmm7,%xmm3
movdqu 48(%rsi),%xmm6
movdqa 0-128(%rcx),%xmm0
paddd %xmm3,%xmm0
-.byte 102,15,56,0,231
+ pshufb %xmm7,%xmm4
movdqa %xmm2,%xmm10
-.byte 15,56,203,209
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
nop
movdqa %xmm1,%xmm9
-.byte 15,56,203,202
+ sha256rnds2 %xmm2,%xmm1
movdqa 32-128(%rcx),%xmm0
paddd %xmm4,%xmm0
-.byte 102,15,56,0,239
-.byte 15,56,203,209
+ pshufb %xmm7,%xmm5
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
leaq 64(%rsi),%rsi
-.byte 15,56,204,220
-.byte 15,56,203,202
+ sha256msg1 %xmm4,%xmm3
+ sha256rnds2 %xmm2,%xmm1
movdqa 64-128(%rcx),%xmm0
paddd %xmm5,%xmm0
-.byte 102,15,56,0,247
-.byte 15,56,203,209
+ pshufb %xmm7,%xmm6
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
-.byte 102,15,58,15,253,4
+ palignr $4,%xmm5,%xmm7
nop
paddd %xmm7,%xmm3
-.byte 15,56,204,229
-.byte 15,56,203,202
+ sha256msg1 %xmm5,%xmm4
+ sha256rnds2 %xmm2,%xmm1
movdqa 96-128(%rcx),%xmm0
paddd %xmm6,%xmm0
-.byte 15,56,205,222
-.byte 15,56,203,209
+ sha256msg2 %xmm6,%xmm3
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
-.byte 102,15,58,15,254,4
+ palignr $4,%xmm6,%xmm7
nop
paddd %xmm7,%xmm4
-.byte 15,56,204,238
-.byte 15,56,203,202
+ sha256msg1 %xmm6,%xmm5
+ sha256rnds2 %xmm2,%xmm1
movdqa 128-128(%rcx),%xmm0
paddd %xmm3,%xmm0
-.byte 15,56,205,227
-.byte 15,56,203,209
+ sha256msg2 %xmm3,%xmm4
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
-.byte 102,15,58,15,251,4
+ palignr $4,%xmm3,%xmm7
nop
paddd %xmm7,%xmm5
-.byte 15,56,204,243
-.byte 15,56,203,202
+ sha256msg1 %xmm3,%xmm6
+ sha256rnds2 %xmm2,%xmm1
movdqa 160-128(%rcx),%xmm0
paddd %xmm4,%xmm0
-.byte 15,56,205,236
-.byte 15,56,203,209
+ sha256msg2 %xmm4,%xmm5
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
-.byte 102,15,58,15,252,4
+ palignr $4,%xmm4,%xmm7
nop
paddd %xmm7,%xmm6
-.byte 15,56,204,220
-.byte 15,56,203,202
+ sha256msg1 %xmm4,%xmm3
+ sha256rnds2 %xmm2,%xmm1
movdqa 192-128(%rcx),%xmm0
paddd %xmm5,%xmm0
-.byte 15,56,205,245
-.byte 15,56,203,209
+ sha256msg2 %xmm5,%xmm6
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
-.byte 102,15,58,15,253,4
+ palignr $4,%xmm5,%xmm7
nop
paddd %xmm7,%xmm3
-.byte 15,56,204,229
-.byte 15,56,203,202
+ sha256msg1 %xmm5,%xmm4
+ sha256rnds2 %xmm2,%xmm1
movdqa 224-128(%rcx),%xmm0
paddd %xmm6,%xmm0
-.byte 15,56,205,222
-.byte 15,56,203,209
+ sha256msg2 %xmm6,%xmm3
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
-.byte 102,15,58,15,254,4
+ palignr $4,%xmm6,%xmm7
nop
paddd %xmm7,%xmm4
-.byte 15,56,204,238
-.byte 15,56,203,202
+ sha256msg1 %xmm6,%xmm5
+ sha256rnds2 %xmm2,%xmm1
movdqa 256-128(%rcx),%xmm0
paddd %xmm3,%xmm0
-.byte 15,56,205,227
-.byte 15,56,203,209
+ sha256msg2 %xmm3,%xmm4
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
-.byte 102,15,58,15,251,4
+ palignr $4,%xmm3,%xmm7
nop
paddd %xmm7,%xmm5
-.byte 15,56,204,243
-.byte 15,56,203,202
+ sha256msg1 %xmm3,%xmm6
+ sha256rnds2 %xmm2,%xmm1
movdqa 288-128(%rcx),%xmm0
paddd %xmm4,%xmm0
-.byte 15,56,205,236
-.byte 15,56,203,209
+ sha256msg2 %xmm4,%xmm5
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
-.byte 102,15,58,15,252,4
+ palignr $4,%xmm4,%xmm7
nop
paddd %xmm7,%xmm6
-.byte 15,56,204,220
-.byte 15,56,203,202
+ sha256msg1 %xmm4,%xmm3
+ sha256rnds2 %xmm2,%xmm1
movdqa 320-128(%rcx),%xmm0
paddd %xmm5,%xmm0
-.byte 15,56,205,245
-.byte 15,56,203,209
+ sha256msg2 %xmm5,%xmm6
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
-.byte 102,15,58,15,253,4
+ palignr $4,%xmm5,%xmm7
nop
paddd %xmm7,%xmm3
-.byte 15,56,204,229
-.byte 15,56,203,202
+ sha256msg1 %xmm5,%xmm4
+ sha256rnds2 %xmm2,%xmm1
movdqa 352-128(%rcx),%xmm0
paddd %xmm6,%xmm0
-.byte 15,56,205,222
-.byte 15,56,203,209
+ sha256msg2 %xmm6,%xmm3
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
-.byte 102,15,58,15,254,4
+ palignr $4,%xmm6,%xmm7
nop
paddd %xmm7,%xmm4
-.byte 15,56,204,238
-.byte 15,56,203,202
+ sha256msg1 %xmm6,%xmm5
+ sha256rnds2 %xmm2,%xmm1
movdqa 384-128(%rcx),%xmm0
paddd %xmm3,%xmm0
-.byte 15,56,205,227
-.byte 15,56,203,209
+ sha256msg2 %xmm3,%xmm4
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
-.byte 102,15,58,15,251,4
+ palignr $4,%xmm3,%xmm7
nop
paddd %xmm7,%xmm5
-.byte 15,56,204,243
-.byte 15,56,203,202
+ sha256msg1 %xmm3,%xmm6
+ sha256rnds2 %xmm2,%xmm1
movdqa 416-128(%rcx),%xmm0
paddd %xmm4,%xmm0
-.byte 15,56,205,236
-.byte 15,56,203,209
+ sha256msg2 %xmm4,%xmm5
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
-.byte 102,15,58,15,252,4
-.byte 15,56,203,202
+ palignr $4,%xmm4,%xmm7
+ sha256rnds2 %xmm2,%xmm1
paddd %xmm7,%xmm6
movdqa 448-128(%rcx),%xmm0
paddd %xmm5,%xmm0
-.byte 15,56,203,209
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
-.byte 15,56,205,245
+ sha256msg2 %xmm5,%xmm6
movdqa %xmm8,%xmm7
-.byte 15,56,203,202
+ sha256rnds2 %xmm2,%xmm1
movdqa 480-128(%rcx),%xmm0
paddd %xmm6,%xmm0
nop
-.byte 15,56,203,209
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
decq %rdx
nop
-.byte 15,56,203,202
+ sha256rnds2 %xmm2,%xmm1
paddd %xmm10,%xmm2
paddd %xmm9,%xmm1
@@ -1968,7 +1968,7 @@
pshufd $0x1b,%xmm1,%xmm7
pshufd $0xb1,%xmm1,%xmm1
punpckhqdq %xmm2,%xmm1
-.byte 102,15,58,15,215,8
+ palignr $8,%xmm7,%xmm2
movdqu %xmm1,(%rdi)
movdqu %xmm2,16(%rdi)
@@ -2024,16 +2024,16 @@
movdqu 0(%rsi),%xmm0
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
-.byte 102,15,56,0,199
+ pshufb %xmm7,%xmm0
movdqu 48(%rsi),%xmm3
leaq K256(%rip),%rbp
-.byte 102,15,56,0,207
+ pshufb %xmm7,%xmm1
movdqa 0(%rbp),%xmm4
movdqa 32(%rbp),%xmm5
-.byte 102,15,56,0,215
+ pshufb %xmm7,%xmm2
paddd %xmm0,%xmm4
movdqa 64(%rbp),%xmm6
-.byte 102,15,56,0,223
+ pshufb %xmm7,%xmm3
movdqa 96(%rbp),%xmm7
paddd %xmm1,%xmm5
paddd %xmm2,%xmm6
@@ -2061,10 +2061,10 @@
xorl %r10d,%r12d
rorl $5,%r13d
xorl %eax,%r14d
-.byte 102,15,58,15,224,4
+ palignr $4,%xmm0,%xmm4
andl %r8d,%r12d
xorl %r8d,%r13d
-.byte 102,15,58,15,250,4
+ palignr $4,%xmm2,%xmm7
addl 0(%rsp),%r11d
movl %eax,%r15d
xorl %r10d,%r12d
@@ -2205,10 +2205,10 @@
xorl %ecx,%r12d
rorl $5,%r13d
xorl %r8d,%r14d
-.byte 102,15,58,15,225,4
+ palignr $4,%xmm1,%xmm4
andl %eax,%r12d
xorl %eax,%r13d
-.byte 102,15,58,15,251,4
+ palignr $4,%xmm3,%xmm7
addl 16(%rsp),%edx
movl %r8d,%r15d
xorl %ecx,%r12d
@@ -2349,10 +2349,10 @@
xorl %r10d,%r12d
rorl $5,%r13d
xorl %eax,%r14d
-.byte 102,15,58,15,226,4
+ palignr $4,%xmm2,%xmm4
andl %r8d,%r12d
xorl %r8d,%r13d
-.byte 102,15,58,15,248,4
+ palignr $4,%xmm0,%xmm7
addl 32(%rsp),%r11d
movl %eax,%r15d
xorl %r10d,%r12d
@@ -2493,10 +2493,10 @@
xorl %ecx,%r12d
rorl $5,%r13d
xorl %r8d,%r14d
-.byte 102,15,58,15,227,4
+ palignr $4,%xmm3,%xmm4
andl %eax,%r12d
xorl %eax,%r13d
-.byte 102,15,58,15,249,4
+ palignr $4,%xmm1,%xmm7
addl 48(%rsp),%edx
movl %r8d,%r15d
xorl %ecx,%r12d
diff --git a/gen/bcm/sha256-x86_64-linux.S b/gen/bcm/sha256-x86_64-linux.S
index 8476b03..938f531 100644
--- a/gen/bcm/sha256-x86_64-linux.S
+++ b/gen/bcm/sha256-x86_64-linux.S
@@ -1780,7 +1780,7 @@
pshufd $0xb1,%xmm1,%xmm1
pshufd $0x1b,%xmm2,%xmm2
movdqa %xmm7,%xmm8
-.byte 102,15,58,15,202,8
+ palignr $8,%xmm2,%xmm1
punpcklqdq %xmm0,%xmm2
jmp .Loop_shaext
@@ -1789,176 +1789,176 @@
movdqu (%rsi),%xmm3
movdqu 16(%rsi),%xmm4
movdqu 32(%rsi),%xmm5
-.byte 102,15,56,0,223
+ pshufb %xmm7,%xmm3
movdqu 48(%rsi),%xmm6
movdqa 0-128(%rcx),%xmm0
paddd %xmm3,%xmm0
-.byte 102,15,56,0,231
+ pshufb %xmm7,%xmm4
movdqa %xmm2,%xmm10
-.byte 15,56,203,209
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
nop
movdqa %xmm1,%xmm9
-.byte 15,56,203,202
+ sha256rnds2 %xmm2,%xmm1
movdqa 32-128(%rcx),%xmm0
paddd %xmm4,%xmm0
-.byte 102,15,56,0,239
-.byte 15,56,203,209
+ pshufb %xmm7,%xmm5
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
leaq 64(%rsi),%rsi
-.byte 15,56,204,220
-.byte 15,56,203,202
+ sha256msg1 %xmm4,%xmm3
+ sha256rnds2 %xmm2,%xmm1
movdqa 64-128(%rcx),%xmm0
paddd %xmm5,%xmm0
-.byte 102,15,56,0,247
-.byte 15,56,203,209
+ pshufb %xmm7,%xmm6
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
-.byte 102,15,58,15,253,4
+ palignr $4,%xmm5,%xmm7
nop
paddd %xmm7,%xmm3
-.byte 15,56,204,229
-.byte 15,56,203,202
+ sha256msg1 %xmm5,%xmm4
+ sha256rnds2 %xmm2,%xmm1
movdqa 96-128(%rcx),%xmm0
paddd %xmm6,%xmm0
-.byte 15,56,205,222
-.byte 15,56,203,209
+ sha256msg2 %xmm6,%xmm3
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
-.byte 102,15,58,15,254,4
+ palignr $4,%xmm6,%xmm7
nop
paddd %xmm7,%xmm4
-.byte 15,56,204,238
-.byte 15,56,203,202
+ sha256msg1 %xmm6,%xmm5
+ sha256rnds2 %xmm2,%xmm1
movdqa 128-128(%rcx),%xmm0
paddd %xmm3,%xmm0
-.byte 15,56,205,227
-.byte 15,56,203,209
+ sha256msg2 %xmm3,%xmm4
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
-.byte 102,15,58,15,251,4
+ palignr $4,%xmm3,%xmm7
nop
paddd %xmm7,%xmm5
-.byte 15,56,204,243
-.byte 15,56,203,202
+ sha256msg1 %xmm3,%xmm6
+ sha256rnds2 %xmm2,%xmm1
movdqa 160-128(%rcx),%xmm0
paddd %xmm4,%xmm0
-.byte 15,56,205,236
-.byte 15,56,203,209
+ sha256msg2 %xmm4,%xmm5
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
-.byte 102,15,58,15,252,4
+ palignr $4,%xmm4,%xmm7
nop
paddd %xmm7,%xmm6
-.byte 15,56,204,220
-.byte 15,56,203,202
+ sha256msg1 %xmm4,%xmm3
+ sha256rnds2 %xmm2,%xmm1
movdqa 192-128(%rcx),%xmm0
paddd %xmm5,%xmm0
-.byte 15,56,205,245
-.byte 15,56,203,209
+ sha256msg2 %xmm5,%xmm6
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
-.byte 102,15,58,15,253,4
+ palignr $4,%xmm5,%xmm7
nop
paddd %xmm7,%xmm3
-.byte 15,56,204,229
-.byte 15,56,203,202
+ sha256msg1 %xmm5,%xmm4
+ sha256rnds2 %xmm2,%xmm1
movdqa 224-128(%rcx),%xmm0
paddd %xmm6,%xmm0
-.byte 15,56,205,222
-.byte 15,56,203,209
+ sha256msg2 %xmm6,%xmm3
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
-.byte 102,15,58,15,254,4
+ palignr $4,%xmm6,%xmm7
nop
paddd %xmm7,%xmm4
-.byte 15,56,204,238
-.byte 15,56,203,202
+ sha256msg1 %xmm6,%xmm5
+ sha256rnds2 %xmm2,%xmm1
movdqa 256-128(%rcx),%xmm0
paddd %xmm3,%xmm0
-.byte 15,56,205,227
-.byte 15,56,203,209
+ sha256msg2 %xmm3,%xmm4
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
-.byte 102,15,58,15,251,4
+ palignr $4,%xmm3,%xmm7
nop
paddd %xmm7,%xmm5
-.byte 15,56,204,243
-.byte 15,56,203,202
+ sha256msg1 %xmm3,%xmm6
+ sha256rnds2 %xmm2,%xmm1
movdqa 288-128(%rcx),%xmm0
paddd %xmm4,%xmm0
-.byte 15,56,205,236
-.byte 15,56,203,209
+ sha256msg2 %xmm4,%xmm5
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
-.byte 102,15,58,15,252,4
+ palignr $4,%xmm4,%xmm7
nop
paddd %xmm7,%xmm6
-.byte 15,56,204,220
-.byte 15,56,203,202
+ sha256msg1 %xmm4,%xmm3
+ sha256rnds2 %xmm2,%xmm1
movdqa 320-128(%rcx),%xmm0
paddd %xmm5,%xmm0
-.byte 15,56,205,245
-.byte 15,56,203,209
+ sha256msg2 %xmm5,%xmm6
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
-.byte 102,15,58,15,253,4
+ palignr $4,%xmm5,%xmm7
nop
paddd %xmm7,%xmm3
-.byte 15,56,204,229
-.byte 15,56,203,202
+ sha256msg1 %xmm5,%xmm4
+ sha256rnds2 %xmm2,%xmm1
movdqa 352-128(%rcx),%xmm0
paddd %xmm6,%xmm0
-.byte 15,56,205,222
-.byte 15,56,203,209
+ sha256msg2 %xmm6,%xmm3
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
-.byte 102,15,58,15,254,4
+ palignr $4,%xmm6,%xmm7
nop
paddd %xmm7,%xmm4
-.byte 15,56,204,238
-.byte 15,56,203,202
+ sha256msg1 %xmm6,%xmm5
+ sha256rnds2 %xmm2,%xmm1
movdqa 384-128(%rcx),%xmm0
paddd %xmm3,%xmm0
-.byte 15,56,205,227
-.byte 15,56,203,209
+ sha256msg2 %xmm3,%xmm4
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
-.byte 102,15,58,15,251,4
+ palignr $4,%xmm3,%xmm7
nop
paddd %xmm7,%xmm5
-.byte 15,56,204,243
-.byte 15,56,203,202
+ sha256msg1 %xmm3,%xmm6
+ sha256rnds2 %xmm2,%xmm1
movdqa 416-128(%rcx),%xmm0
paddd %xmm4,%xmm0
-.byte 15,56,205,236
-.byte 15,56,203,209
+ sha256msg2 %xmm4,%xmm5
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
-.byte 102,15,58,15,252,4
-.byte 15,56,203,202
+ palignr $4,%xmm4,%xmm7
+ sha256rnds2 %xmm2,%xmm1
paddd %xmm7,%xmm6
movdqa 448-128(%rcx),%xmm0
paddd %xmm5,%xmm0
-.byte 15,56,203,209
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
-.byte 15,56,205,245
+ sha256msg2 %xmm5,%xmm6
movdqa %xmm8,%xmm7
-.byte 15,56,203,202
+ sha256rnds2 %xmm2,%xmm1
movdqa 480-128(%rcx),%xmm0
paddd %xmm6,%xmm0
nop
-.byte 15,56,203,209
+ sha256rnds2 %xmm1,%xmm2
pshufd $0x0e,%xmm0,%xmm0
decq %rdx
nop
-.byte 15,56,203,202
+ sha256rnds2 %xmm2,%xmm1
paddd %xmm10,%xmm2
paddd %xmm9,%xmm1
@@ -1968,7 +1968,7 @@
pshufd $0x1b,%xmm1,%xmm7
pshufd $0xb1,%xmm1,%xmm1
punpckhqdq %xmm2,%xmm1
-.byte 102,15,58,15,215,8
+ palignr $8,%xmm7,%xmm2
movdqu %xmm1,(%rdi)
movdqu %xmm2,16(%rdi)
@@ -2024,16 +2024,16 @@
movdqu 0(%rsi),%xmm0
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
-.byte 102,15,56,0,199
+ pshufb %xmm7,%xmm0
movdqu 48(%rsi),%xmm3
leaq K256(%rip),%rbp
-.byte 102,15,56,0,207
+ pshufb %xmm7,%xmm1
movdqa 0(%rbp),%xmm4
movdqa 32(%rbp),%xmm5
-.byte 102,15,56,0,215
+ pshufb %xmm7,%xmm2
paddd %xmm0,%xmm4
movdqa 64(%rbp),%xmm6
-.byte 102,15,56,0,223
+ pshufb %xmm7,%xmm3
movdqa 96(%rbp),%xmm7
paddd %xmm1,%xmm5
paddd %xmm2,%xmm6
@@ -2061,10 +2061,10 @@
xorl %r10d,%r12d
rorl $5,%r13d
xorl %eax,%r14d
-.byte 102,15,58,15,224,4
+ palignr $4,%xmm0,%xmm4
andl %r8d,%r12d
xorl %r8d,%r13d
-.byte 102,15,58,15,250,4
+ palignr $4,%xmm2,%xmm7
addl 0(%rsp),%r11d
movl %eax,%r15d
xorl %r10d,%r12d
@@ -2205,10 +2205,10 @@
xorl %ecx,%r12d
rorl $5,%r13d
xorl %r8d,%r14d
-.byte 102,15,58,15,225,4
+ palignr $4,%xmm1,%xmm4
andl %eax,%r12d
xorl %eax,%r13d
-.byte 102,15,58,15,251,4
+ palignr $4,%xmm3,%xmm7
addl 16(%rsp),%edx
movl %r8d,%r15d
xorl %ecx,%r12d
@@ -2349,10 +2349,10 @@
xorl %r10d,%r12d
rorl $5,%r13d
xorl %eax,%r14d
-.byte 102,15,58,15,226,4
+ palignr $4,%xmm2,%xmm4
andl %r8d,%r12d
xorl %r8d,%r13d
-.byte 102,15,58,15,248,4
+ palignr $4,%xmm0,%xmm7
addl 32(%rsp),%r11d
movl %eax,%r15d
xorl %r10d,%r12d
@@ -2493,10 +2493,10 @@
xorl %ecx,%r12d
rorl $5,%r13d
xorl %r8d,%r14d
-.byte 102,15,58,15,227,4
+ palignr $4,%xmm3,%xmm4
andl %eax,%r12d
xorl %eax,%r13d
-.byte 102,15,58,15,249,4
+ palignr $4,%xmm1,%xmm7
addl 48(%rsp),%edx
movl %r8d,%r15d
xorl %ecx,%r12d
diff --git a/gen/bcm/sha256-x86_64-win.asm b/gen/bcm/sha256-x86_64-win.asm
index ada8dba..b720603 100644
--- a/gen/bcm/sha256-x86_64-win.asm
+++ b/gen/bcm/sha256-x86_64-win.asm
@@ -1818,7 +1818,7 @@
pshufd xmm1,xmm1,0xb1
pshufd xmm2,xmm2,0x1b
movdqa xmm8,xmm7
-DB 102,15,58,15,202,8
+ palignr xmm1,xmm2,8
punpcklqdq xmm2,xmm0
jmp NEAR $L$oop_shaext
@@ -1827,176 +1827,176 @@
movdqu xmm3,XMMWORD[rsi]
movdqu xmm4,XMMWORD[16+rsi]
movdqu xmm5,XMMWORD[32+rsi]
-DB 102,15,56,0,223
+ pshufb xmm3,xmm7
movdqu xmm6,XMMWORD[48+rsi]
movdqa xmm0,XMMWORD[((0-128))+rcx]
paddd xmm0,xmm3
-DB 102,15,56,0,231
+ pshufb xmm4,xmm7
movdqa xmm10,xmm2
- DB 15,56,203,209
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
nop
movdqa xmm9,xmm1
- DB 15,56,203,202
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((32-128))+rcx]
paddd xmm0,xmm4
-DB 102,15,56,0,239
- DB 15,56,203,209
+ pshufb xmm5,xmm7
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
lea rsi,[64+rsi]
- DB 15,56,204,220
- DB 15,56,203,202
+ sha256msg1 xmm3,xmm4
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((64-128))+rcx]
paddd xmm0,xmm5
-DB 102,15,56,0,247
- DB 15,56,203,209
+ pshufb xmm6,xmm7
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
movdqa xmm7,xmm6
-DB 102,15,58,15,253,4
+ palignr xmm7,xmm5,4
nop
paddd xmm3,xmm7
- DB 15,56,204,229
- DB 15,56,203,202
+ sha256msg1 xmm4,xmm5
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((96-128))+rcx]
paddd xmm0,xmm6
- DB 15,56,205,222
- DB 15,56,203,209
+ sha256msg2 xmm3,xmm6
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
movdqa xmm7,xmm3
-DB 102,15,58,15,254,4
+ palignr xmm7,xmm6,4
nop
paddd xmm4,xmm7
- DB 15,56,204,238
- DB 15,56,203,202
+ sha256msg1 xmm5,xmm6
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((128-128))+rcx]
paddd xmm0,xmm3
- DB 15,56,205,227
- DB 15,56,203,209
+ sha256msg2 xmm4,xmm3
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
movdqa xmm7,xmm4
-DB 102,15,58,15,251,4
+ palignr xmm7,xmm3,4
nop
paddd xmm5,xmm7
- DB 15,56,204,243
- DB 15,56,203,202
+ sha256msg1 xmm6,xmm3
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((160-128))+rcx]
paddd xmm0,xmm4
- DB 15,56,205,236
- DB 15,56,203,209
+ sha256msg2 xmm5,xmm4
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
movdqa xmm7,xmm5
-DB 102,15,58,15,252,4
+ palignr xmm7,xmm4,4
nop
paddd xmm6,xmm7
- DB 15,56,204,220
- DB 15,56,203,202
+ sha256msg1 xmm3,xmm4
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((192-128))+rcx]
paddd xmm0,xmm5
- DB 15,56,205,245
- DB 15,56,203,209
+ sha256msg2 xmm6,xmm5
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
movdqa xmm7,xmm6
-DB 102,15,58,15,253,4
+ palignr xmm7,xmm5,4
nop
paddd xmm3,xmm7
- DB 15,56,204,229
- DB 15,56,203,202
+ sha256msg1 xmm4,xmm5
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((224-128))+rcx]
paddd xmm0,xmm6
- DB 15,56,205,222
- DB 15,56,203,209
+ sha256msg2 xmm3,xmm6
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
movdqa xmm7,xmm3
-DB 102,15,58,15,254,4
+ palignr xmm7,xmm6,4
nop
paddd xmm4,xmm7
- DB 15,56,204,238
- DB 15,56,203,202
+ sha256msg1 xmm5,xmm6
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((256-128))+rcx]
paddd xmm0,xmm3
- DB 15,56,205,227
- DB 15,56,203,209
+ sha256msg2 xmm4,xmm3
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
movdqa xmm7,xmm4
-DB 102,15,58,15,251,4
+ palignr xmm7,xmm3,4
nop
paddd xmm5,xmm7
- DB 15,56,204,243
- DB 15,56,203,202
+ sha256msg1 xmm6,xmm3
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((288-128))+rcx]
paddd xmm0,xmm4
- DB 15,56,205,236
- DB 15,56,203,209
+ sha256msg2 xmm5,xmm4
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
movdqa xmm7,xmm5
-DB 102,15,58,15,252,4
+ palignr xmm7,xmm4,4
nop
paddd xmm6,xmm7
- DB 15,56,204,220
- DB 15,56,203,202
+ sha256msg1 xmm3,xmm4
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((320-128))+rcx]
paddd xmm0,xmm5
- DB 15,56,205,245
- DB 15,56,203,209
+ sha256msg2 xmm6,xmm5
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
movdqa xmm7,xmm6
-DB 102,15,58,15,253,4
+ palignr xmm7,xmm5,4
nop
paddd xmm3,xmm7
- DB 15,56,204,229
- DB 15,56,203,202
+ sha256msg1 xmm4,xmm5
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((352-128))+rcx]
paddd xmm0,xmm6
- DB 15,56,205,222
- DB 15,56,203,209
+ sha256msg2 xmm3,xmm6
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
movdqa xmm7,xmm3
-DB 102,15,58,15,254,4
+ palignr xmm7,xmm6,4
nop
paddd xmm4,xmm7
- DB 15,56,204,238
- DB 15,56,203,202
+ sha256msg1 xmm5,xmm6
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((384-128))+rcx]
paddd xmm0,xmm3
- DB 15,56,205,227
- DB 15,56,203,209
+ sha256msg2 xmm4,xmm3
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
movdqa xmm7,xmm4
-DB 102,15,58,15,251,4
+ palignr xmm7,xmm3,4
nop
paddd xmm5,xmm7
- DB 15,56,204,243
- DB 15,56,203,202
+ sha256msg1 xmm6,xmm3
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((416-128))+rcx]
paddd xmm0,xmm4
- DB 15,56,205,236
- DB 15,56,203,209
+ sha256msg2 xmm5,xmm4
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
movdqa xmm7,xmm5
-DB 102,15,58,15,252,4
- DB 15,56,203,202
+ palignr xmm7,xmm4,4
+ sha256rnds2 xmm1,xmm2
paddd xmm6,xmm7
movdqa xmm0,XMMWORD[((448-128))+rcx]
paddd xmm0,xmm5
- DB 15,56,203,209
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
- DB 15,56,205,245
+ sha256msg2 xmm6,xmm5
movdqa xmm7,xmm8
- DB 15,56,203,202
+ sha256rnds2 xmm1,xmm2
movdqa xmm0,XMMWORD[((480-128))+rcx]
paddd xmm0,xmm6
nop
- DB 15,56,203,209
+ sha256rnds2 xmm2,xmm1
pshufd xmm0,xmm0,0x0e
dec rdx
nop
- DB 15,56,203,202
+ sha256rnds2 xmm1,xmm2
paddd xmm2,xmm10
paddd xmm1,xmm9
@@ -2006,7 +2006,7 @@
pshufd xmm7,xmm1,0x1b
pshufd xmm1,xmm1,0xb1
punpckhqdq xmm1,xmm2
-DB 102,15,58,15,215,8
+ palignr xmm2,xmm7,8
movdqu XMMWORD[rdi],xmm1
movdqu XMMWORD[16+rdi],xmm2
@@ -2083,16 +2083,16 @@
movdqu xmm0,XMMWORD[rsi]
movdqu xmm1,XMMWORD[16+rsi]
movdqu xmm2,XMMWORD[32+rsi]
-DB 102,15,56,0,199
+ pshufb xmm0,xmm7
movdqu xmm3,XMMWORD[48+rsi]
lea rbp,[K256]
-DB 102,15,56,0,207
+ pshufb xmm1,xmm7
movdqa xmm4,XMMWORD[rbp]
movdqa xmm5,XMMWORD[32+rbp]
-DB 102,15,56,0,215
+ pshufb xmm2,xmm7
paddd xmm4,xmm0
movdqa xmm6,XMMWORD[64+rbp]
-DB 102,15,56,0,223
+ pshufb xmm3,xmm7
movdqa xmm7,XMMWORD[96+rbp]
paddd xmm5,xmm1
paddd xmm6,xmm2
@@ -2120,10 +2120,10 @@
xor r12d,r10d
ror r13d,5
xor r14d,eax
-DB 102,15,58,15,224,4
+ palignr xmm4,xmm0,4
and r12d,r8d
xor r13d,r8d
-DB 102,15,58,15,250,4
+ palignr xmm7,xmm2,4
add r11d,DWORD[rsp]
mov r15d,eax
xor r12d,r10d
@@ -2264,10 +2264,10 @@
xor r12d,ecx
ror r13d,5
xor r14d,r8d
-DB 102,15,58,15,225,4
+ palignr xmm4,xmm1,4
and r12d,eax
xor r13d,eax
-DB 102,15,58,15,251,4
+ palignr xmm7,xmm3,4
add edx,DWORD[16+rsp]
mov r15d,r8d
xor r12d,ecx
@@ -2408,10 +2408,10 @@
xor r12d,r10d
ror r13d,5
xor r14d,eax
-DB 102,15,58,15,226,4
+ palignr xmm4,xmm2,4
and r12d,r8d
xor r13d,r8d
-DB 102,15,58,15,248,4
+ palignr xmm7,xmm0,4
add r11d,DWORD[32+rsp]
mov r15d,eax
xor r12d,r10d
@@ -2552,10 +2552,10 @@
xor r12d,ecx
ror r13d,5
xor r14d,r8d
-DB 102,15,58,15,227,4
+ palignr xmm4,xmm3,4
and r12d,eax
xor r13d,eax
-DB 102,15,58,15,249,4
+ palignr xmm7,xmm1,4
add edx,DWORD[48+rsp]
mov r15d,r8d
xor r12d,ecx
diff --git a/gen/bcm/vpaes-x86_64-apple.S b/gen/bcm/vpaes-x86_64-apple.S
index 5aea40f..bfcc030 100644
--- a/gen/bcm/vpaes-x86_64-apple.S
+++ b/gen/bcm/vpaes-x86_64-apple.S
@@ -34,9 +34,9 @@
movdqu (%r9),%xmm5
psrld $4,%xmm1
pand %xmm9,%xmm0
-.byte 102,15,56,0,208
+ pshufb %xmm0,%xmm2
movdqa L$k_ipt+16(%rip),%xmm0
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
pxor %xmm5,%xmm2
addq $16,%r9
pxor %xmm2,%xmm0
@@ -48,25 +48,25 @@
movdqa %xmm13,%xmm4
movdqa %xmm12,%xmm0
-.byte 102,15,56,0,226
-.byte 102,15,56,0,195
+ pshufb %xmm2,%xmm4
+ pshufb %xmm3,%xmm0
pxor %xmm5,%xmm4
movdqa %xmm15,%xmm5
pxor %xmm4,%xmm0
movdqa -64(%r11,%r10,1),%xmm1
-.byte 102,15,56,0,234
+ pshufb %xmm2,%xmm5
movdqa (%r11,%r10,1),%xmm4
movdqa %xmm14,%xmm2
-.byte 102,15,56,0,211
+ pshufb %xmm3,%xmm2
movdqa %xmm0,%xmm3
pxor %xmm5,%xmm2
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
addq $16,%r9
pxor %xmm2,%xmm0
-.byte 102,15,56,0,220
+ pshufb %xmm4,%xmm3
addq $16,%r11
pxor %xmm0,%xmm3
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
@@ -78,19 +78,19 @@
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm9,%xmm0
-.byte 102,15,56,0,232
+ pshufb %xmm0,%xmm5
movdqa %xmm10,%xmm3
pxor %xmm1,%xmm0
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
movdqa %xmm10,%xmm4
pxor %xmm5,%xmm3
-.byte 102,15,56,0,224
+ pshufb %xmm0,%xmm4
movdqa %xmm10,%xmm2
pxor %xmm5,%xmm4
-.byte 102,15,56,0,211
+ pshufb %xmm3,%xmm2
movdqa %xmm10,%xmm3
pxor %xmm0,%xmm2
-.byte 102,15,56,0,220
+ pshufb %xmm4,%xmm3
movdqu (%r9),%xmm5
pxor %xmm1,%xmm3
jnz L$enc_loop
@@ -98,12 +98,12 @@
movdqa -96(%r10),%xmm4
movdqa -80(%r10),%xmm0
-.byte 102,15,56,0,226
+ pshufb %xmm2,%xmm4
pxor %xmm5,%xmm4
-.byte 102,15,56,0,195
+ pshufb %xmm3,%xmm0
movdqa 64(%r11,%r10,1),%xmm1
pxor %xmm4,%xmm0
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
ret
@@ -156,12 +156,12 @@
psrld $4,%xmm7
pand %xmm9,%xmm0
pand %xmm9,%xmm6
-.byte 102,15,56,0,208
-.byte 102,68,15,56,0,198
+ pshufb %xmm0,%xmm2
+ pshufb %xmm6,%xmm8
movdqa L$k_ipt+16(%rip),%xmm0
movdqa %xmm0,%xmm6
-.byte 102,15,56,0,193
-.byte 102,15,56,0,247
+ pshufb %xmm1,%xmm0
+ pshufb %xmm7,%xmm6
pxor %xmm5,%xmm2
pxor %xmm5,%xmm8
addq $16,%r9
@@ -177,10 +177,10 @@
movdqa L$k_sb1+16(%rip),%xmm0
movdqa %xmm4,%xmm12
movdqa %xmm0,%xmm6
-.byte 102,15,56,0,226
-.byte 102,69,15,56,0,224
-.byte 102,15,56,0,195
-.byte 102,65,15,56,0,243
+ pshufb %xmm2,%xmm4
+ pshufb %xmm8,%xmm12
+ pshufb %xmm3,%xmm0
+ pshufb %xmm11,%xmm6
pxor %xmm5,%xmm4
pxor %xmm5,%xmm12
movdqa L$k_sb2(%rip),%xmm5
@@ -189,30 +189,30 @@
pxor %xmm12,%xmm6
movdqa -64(%r11,%r10,1),%xmm1
-.byte 102,15,56,0,234
-.byte 102,69,15,56,0,232
+ pshufb %xmm2,%xmm5
+ pshufb %xmm8,%xmm13
movdqa (%r11,%r10,1),%xmm4
movdqa L$k_sb2+16(%rip),%xmm2
movdqa %xmm2,%xmm8
-.byte 102,15,56,0,211
-.byte 102,69,15,56,0,195
+ pshufb %xmm3,%xmm2
+ pshufb %xmm11,%xmm8
movdqa %xmm0,%xmm3
movdqa %xmm6,%xmm11
pxor %xmm5,%xmm2
pxor %xmm13,%xmm8
-.byte 102,15,56,0,193
-.byte 102,15,56,0,241
+ pshufb %xmm1,%xmm0
+ pshufb %xmm1,%xmm6
addq $16,%r9
pxor %xmm2,%xmm0
pxor %xmm8,%xmm6
-.byte 102,15,56,0,220
-.byte 102,68,15,56,0,220
+ pshufb %xmm4,%xmm3
+ pshufb %xmm4,%xmm11
addq $16,%r11
pxor %xmm0,%xmm3
pxor %xmm6,%xmm11
-.byte 102,15,56,0,193
-.byte 102,15,56,0,241
+ pshufb %xmm1,%xmm0
+ pshufb %xmm1,%xmm6
andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
@@ -230,32 +230,32 @@
psrld $4,%xmm7
pand %xmm9,%xmm0
pand %xmm9,%xmm6
-.byte 102,15,56,0,232
-.byte 102,68,15,56,0,238
+ pshufb %xmm0,%xmm5
+ pshufb %xmm6,%xmm13
movdqa %xmm10,%xmm3
movdqa %xmm10,%xmm11
pxor %xmm1,%xmm0
pxor %xmm7,%xmm6
-.byte 102,15,56,0,217
-.byte 102,68,15,56,0,223
+ pshufb %xmm1,%xmm3
+ pshufb %xmm7,%xmm11
movdqa %xmm10,%xmm4
movdqa %xmm10,%xmm12
pxor %xmm5,%xmm3
pxor %xmm13,%xmm11
-.byte 102,15,56,0,224
-.byte 102,68,15,56,0,230
+ pshufb %xmm0,%xmm4
+ pshufb %xmm6,%xmm12
movdqa %xmm10,%xmm2
movdqa %xmm10,%xmm8
pxor %xmm5,%xmm4
pxor %xmm13,%xmm12
-.byte 102,15,56,0,211
-.byte 102,69,15,56,0,195
+ pshufb %xmm3,%xmm2
+ pshufb %xmm11,%xmm8
movdqa %xmm10,%xmm3
movdqa %xmm10,%xmm11
pxor %xmm0,%xmm2
pxor %xmm6,%xmm8
-.byte 102,15,56,0,220
-.byte 102,69,15,56,0,220
+ pshufb %xmm4,%xmm3
+ pshufb %xmm12,%xmm11
movdqu (%r9),%xmm5
pxor %xmm1,%xmm3
@@ -267,18 +267,18 @@
movdqa -80(%r10),%xmm0
movdqa %xmm4,%xmm12
movdqa %xmm0,%xmm6
-.byte 102,15,56,0,226
-.byte 102,69,15,56,0,224
+ pshufb %xmm2,%xmm4
+ pshufb %xmm8,%xmm12
pxor %xmm5,%xmm4
pxor %xmm5,%xmm12
-.byte 102,15,56,0,195
-.byte 102,65,15,56,0,243
+ pshufb %xmm3,%xmm0
+ pshufb %xmm11,%xmm6
movdqa 64(%r11,%r10,1),%xmm1
pxor %xmm4,%xmm0
pxor %xmm12,%xmm6
-.byte 102,15,56,0,193
-.byte 102,15,56,0,241
+ pshufb %xmm1,%xmm0
+ pshufb %xmm1,%xmm6
ret
@@ -302,11 +302,11 @@
movdqu (%r9),%xmm5
shlq $4,%r11
pand %xmm9,%xmm0
-.byte 102,15,56,0,208
+ pshufb %xmm0,%xmm2
movdqa L$k_dipt+16(%rip),%xmm0
xorq $0x30,%r11
leaq L$k_dsbd(%rip),%r10
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
andq $0x30,%r11
pxor %xmm5,%xmm2
movdqa L$k_mc_forward+48(%rip),%xmm5
@@ -322,35 +322,35 @@
movdqa -32(%r10),%xmm4
movdqa -16(%r10),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,203
+ pshufb %xmm2,%xmm4
+ pshufb %xmm3,%xmm1
pxor %xmm4,%xmm0
movdqa 0(%r10),%xmm4
pxor %xmm1,%xmm0
movdqa 16(%r10),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,197
-.byte 102,15,56,0,203
+ pshufb %xmm2,%xmm4
+ pshufb %xmm5,%xmm0
+ pshufb %xmm3,%xmm1
pxor %xmm4,%xmm0
movdqa 32(%r10),%xmm4
pxor %xmm1,%xmm0
movdqa 48(%r10),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,197
-.byte 102,15,56,0,203
+ pshufb %xmm2,%xmm4
+ pshufb %xmm5,%xmm0
+ pshufb %xmm3,%xmm1
pxor %xmm4,%xmm0
movdqa 64(%r10),%xmm4
pxor %xmm1,%xmm0
movdqa 80(%r10),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,197
-.byte 102,15,56,0,203
+ pshufb %xmm2,%xmm4
+ pshufb %xmm5,%xmm0
+ pshufb %xmm3,%xmm1
pxor %xmm4,%xmm0
addq $16,%r9
-.byte 102,15,58,15,237,12
+ palignr $12,%xmm5,%xmm5
pxor %xmm1,%xmm0
subq $1,%rax
@@ -361,32 +361,32 @@
movdqa %xmm11,%xmm2
psrld $4,%xmm1
pand %xmm9,%xmm0
-.byte 102,15,56,0,208
+ pshufb %xmm0,%xmm2
movdqa %xmm10,%xmm3
pxor %xmm1,%xmm0
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
movdqa %xmm10,%xmm4
pxor %xmm2,%xmm3
-.byte 102,15,56,0,224
+ pshufb %xmm0,%xmm4
pxor %xmm2,%xmm4
movdqa %xmm10,%xmm2
-.byte 102,15,56,0,211
+ pshufb %xmm3,%xmm2
movdqa %xmm10,%xmm3
pxor %xmm0,%xmm2
-.byte 102,15,56,0,220
+ pshufb %xmm4,%xmm3
movdqu (%r9),%xmm0
pxor %xmm1,%xmm3
jnz L$dec_loop
movdqa 96(%r10),%xmm4
-.byte 102,15,56,0,226
+ pshufb %xmm2,%xmm4
pxor %xmm0,%xmm4
movdqa 112(%r10),%xmm0
movdqa -352(%r11),%xmm2
-.byte 102,15,56,0,195
+ pshufb %xmm3,%xmm0
pxor %xmm4,%xmm0
-.byte 102,15,56,0,194
+ pshufb %xmm2,%xmm0
ret
@@ -426,7 +426,7 @@
L$schedule_am_decrypting:
movdqa (%r8,%r10,1),%xmm1
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
movdqu %xmm3,(%rdx)
xorq $0x30,%r8
@@ -480,7 +480,7 @@
L$oop_schedule_192:
call _vpaes_schedule_round
-.byte 102,15,58,15,198,8
+ palignr $8,%xmm6,%xmm0
call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
call _vpaes_schedule_mangle
@@ -546,7 +546,7 @@
movdqa (%r8,%r10,1),%xmm1
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
leaq L$k_opt(%rip),%r11
addq $32,%rdx
@@ -622,13 +622,13 @@
pxor %xmm1,%xmm1
-.byte 102,65,15,58,15,200,15
-.byte 102,69,15,58,15,192,15
+ palignr $15,%xmm8,%xmm1
+ palignr $15,%xmm8,%xmm8
pxor %xmm1,%xmm7
pshufd $0xFF,%xmm0,%xmm0
-.byte 102,15,58,15,192,1
+ palignr $1,%xmm0,%xmm0
@@ -649,24 +649,24 @@
psrld $4,%xmm1
pand %xmm9,%xmm0
movdqa %xmm11,%xmm2
-.byte 102,15,56,0,208
+ pshufb %xmm0,%xmm2
pxor %xmm1,%xmm0
movdqa %xmm10,%xmm3
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
pxor %xmm2,%xmm3
movdqa %xmm10,%xmm4
-.byte 102,15,56,0,224
+ pshufb %xmm0,%xmm4
pxor %xmm2,%xmm4
movdqa %xmm10,%xmm2
-.byte 102,15,56,0,211
+ pshufb %xmm3,%xmm2
pxor %xmm0,%xmm2
movdqa %xmm10,%xmm3
-.byte 102,15,56,0,220
+ pshufb %xmm4,%xmm3
pxor %xmm1,%xmm3
movdqa %xmm13,%xmm4
-.byte 102,15,56,0,226
+ pshufb %xmm2,%xmm4
movdqa %xmm12,%xmm0
-.byte 102,15,56,0,195
+ pshufb %xmm3,%xmm0
pxor %xmm4,%xmm0
@@ -694,9 +694,9 @@
psrld $4,%xmm1
pand %xmm9,%xmm0
movdqa (%r11),%xmm2
-.byte 102,15,56,0,208
+ pshufb %xmm0,%xmm2
movdqa 16(%r11),%xmm0
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
pxor %xmm2,%xmm0
ret
@@ -737,11 +737,11 @@
addq $16,%rdx
pxor L$k_s63(%rip),%xmm4
-.byte 102,15,56,0,229
+ pshufb %xmm5,%xmm4
movdqa %xmm4,%xmm3
-.byte 102,15,56,0,229
+ pshufb %xmm5,%xmm4
pxor %xmm4,%xmm3
-.byte 102,15,56,0,229
+ pshufb %xmm5,%xmm4
pxor %xmm4,%xmm3
jmp L$schedule_mangle_both
@@ -755,40 +755,40 @@
pand %xmm9,%xmm4
movdqa 0(%r11),%xmm2
-.byte 102,15,56,0,212
+ pshufb %xmm4,%xmm2
movdqa 16(%r11),%xmm3
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
pxor %xmm2,%xmm3
-.byte 102,15,56,0,221
+ pshufb %xmm5,%xmm3
movdqa 32(%r11),%xmm2
-.byte 102,15,56,0,212
+ pshufb %xmm4,%xmm2
pxor %xmm3,%xmm2
movdqa 48(%r11),%xmm3
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
pxor %xmm2,%xmm3
-.byte 102,15,56,0,221
+ pshufb %xmm5,%xmm3
movdqa 64(%r11),%xmm2
-.byte 102,15,56,0,212
+ pshufb %xmm4,%xmm2
pxor %xmm3,%xmm2
movdqa 80(%r11),%xmm3
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
pxor %xmm2,%xmm3
-.byte 102,15,56,0,221
+ pshufb %xmm5,%xmm3
movdqa 96(%r11),%xmm2
-.byte 102,15,56,0,212
+ pshufb %xmm4,%xmm2
pxor %xmm3,%xmm2
movdqa 112(%r11),%xmm3
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
pxor %xmm2,%xmm3
addq $-16,%rdx
L$schedule_mangle_both:
movdqa (%r8,%r10,1),%xmm1
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
addq $-16,%r8
andq $0x30,%r8
movdqu %xmm3,(%rdx)
@@ -969,8 +969,8 @@
movdqa L$rev_ctr(%rip),%xmm1
movdqa %xmm14,%xmm0
movdqa %xmm15,%xmm6
-.byte 102,15,56,0,193
-.byte 102,15,56,0,241
+ pshufb %xmm1,%xmm0
+ pshufb %xmm1,%xmm6
call _vpaes_encrypt_core_2x
movdqu (%rdi),%xmm1
movdqu 16(%rdi),%xmm2
diff --git a/gen/bcm/vpaes-x86_64-linux.S b/gen/bcm/vpaes-x86_64-linux.S
index 019c638..e788464 100644
--- a/gen/bcm/vpaes-x86_64-linux.S
+++ b/gen/bcm/vpaes-x86_64-linux.S
@@ -34,9 +34,9 @@
movdqu (%r9),%xmm5
psrld $4,%xmm1
pand %xmm9,%xmm0
-.byte 102,15,56,0,208
+ pshufb %xmm0,%xmm2
movdqa .Lk_ipt+16(%rip),%xmm0
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
pxor %xmm5,%xmm2
addq $16,%r9
pxor %xmm2,%xmm0
@@ -48,25 +48,25 @@
movdqa %xmm13,%xmm4
movdqa %xmm12,%xmm0
-.byte 102,15,56,0,226
-.byte 102,15,56,0,195
+ pshufb %xmm2,%xmm4
+ pshufb %xmm3,%xmm0
pxor %xmm5,%xmm4
movdqa %xmm15,%xmm5
pxor %xmm4,%xmm0
movdqa -64(%r11,%r10,1),%xmm1
-.byte 102,15,56,0,234
+ pshufb %xmm2,%xmm5
movdqa (%r11,%r10,1),%xmm4
movdqa %xmm14,%xmm2
-.byte 102,15,56,0,211
+ pshufb %xmm3,%xmm2
movdqa %xmm0,%xmm3
pxor %xmm5,%xmm2
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
addq $16,%r9
pxor %xmm2,%xmm0
-.byte 102,15,56,0,220
+ pshufb %xmm4,%xmm3
addq $16,%r11
pxor %xmm0,%xmm3
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
@@ -78,19 +78,19 @@
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm9,%xmm0
-.byte 102,15,56,0,232
+ pshufb %xmm0,%xmm5
movdqa %xmm10,%xmm3
pxor %xmm1,%xmm0
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
movdqa %xmm10,%xmm4
pxor %xmm5,%xmm3
-.byte 102,15,56,0,224
+ pshufb %xmm0,%xmm4
movdqa %xmm10,%xmm2
pxor %xmm5,%xmm4
-.byte 102,15,56,0,211
+ pshufb %xmm3,%xmm2
movdqa %xmm10,%xmm3
pxor %xmm0,%xmm2
-.byte 102,15,56,0,220
+ pshufb %xmm4,%xmm3
movdqu (%r9),%xmm5
pxor %xmm1,%xmm3
jnz .Lenc_loop
@@ -98,12 +98,12 @@
movdqa -96(%r10),%xmm4
movdqa -80(%r10),%xmm0
-.byte 102,15,56,0,226
+ pshufb %xmm2,%xmm4
pxor %xmm5,%xmm4
-.byte 102,15,56,0,195
+ pshufb %xmm3,%xmm0
movdqa 64(%r11,%r10,1),%xmm1
pxor %xmm4,%xmm0
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
ret
.cfi_endproc
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
@@ -156,12 +156,12 @@
psrld $4,%xmm7
pand %xmm9,%xmm0
pand %xmm9,%xmm6
-.byte 102,15,56,0,208
-.byte 102,68,15,56,0,198
+ pshufb %xmm0,%xmm2
+ pshufb %xmm6,%xmm8
movdqa .Lk_ipt+16(%rip),%xmm0
movdqa %xmm0,%xmm6
-.byte 102,15,56,0,193
-.byte 102,15,56,0,247
+ pshufb %xmm1,%xmm0
+ pshufb %xmm7,%xmm6
pxor %xmm5,%xmm2
pxor %xmm5,%xmm8
addq $16,%r9
@@ -177,10 +177,10 @@
movdqa .Lk_sb1+16(%rip),%xmm0
movdqa %xmm4,%xmm12
movdqa %xmm0,%xmm6
-.byte 102,15,56,0,226
-.byte 102,69,15,56,0,224
-.byte 102,15,56,0,195
-.byte 102,65,15,56,0,243
+ pshufb %xmm2,%xmm4
+ pshufb %xmm8,%xmm12
+ pshufb %xmm3,%xmm0
+ pshufb %xmm11,%xmm6
pxor %xmm5,%xmm4
pxor %xmm5,%xmm12
movdqa .Lk_sb2(%rip),%xmm5
@@ -189,30 +189,30 @@
pxor %xmm12,%xmm6
movdqa -64(%r11,%r10,1),%xmm1
-.byte 102,15,56,0,234
-.byte 102,69,15,56,0,232
+ pshufb %xmm2,%xmm5
+ pshufb %xmm8,%xmm13
movdqa (%r11,%r10,1),%xmm4
movdqa .Lk_sb2+16(%rip),%xmm2
movdqa %xmm2,%xmm8
-.byte 102,15,56,0,211
-.byte 102,69,15,56,0,195
+ pshufb %xmm3,%xmm2
+ pshufb %xmm11,%xmm8
movdqa %xmm0,%xmm3
movdqa %xmm6,%xmm11
pxor %xmm5,%xmm2
pxor %xmm13,%xmm8
-.byte 102,15,56,0,193
-.byte 102,15,56,0,241
+ pshufb %xmm1,%xmm0
+ pshufb %xmm1,%xmm6
addq $16,%r9
pxor %xmm2,%xmm0
pxor %xmm8,%xmm6
-.byte 102,15,56,0,220
-.byte 102,68,15,56,0,220
+ pshufb %xmm4,%xmm3
+ pshufb %xmm4,%xmm11
addq $16,%r11
pxor %xmm0,%xmm3
pxor %xmm6,%xmm11
-.byte 102,15,56,0,193
-.byte 102,15,56,0,241
+ pshufb %xmm1,%xmm0
+ pshufb %xmm1,%xmm6
andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
@@ -230,32 +230,32 @@
psrld $4,%xmm7
pand %xmm9,%xmm0
pand %xmm9,%xmm6
-.byte 102,15,56,0,232
-.byte 102,68,15,56,0,238
+ pshufb %xmm0,%xmm5
+ pshufb %xmm6,%xmm13
movdqa %xmm10,%xmm3
movdqa %xmm10,%xmm11
pxor %xmm1,%xmm0
pxor %xmm7,%xmm6
-.byte 102,15,56,0,217
-.byte 102,68,15,56,0,223
+ pshufb %xmm1,%xmm3
+ pshufb %xmm7,%xmm11
movdqa %xmm10,%xmm4
movdqa %xmm10,%xmm12
pxor %xmm5,%xmm3
pxor %xmm13,%xmm11
-.byte 102,15,56,0,224
-.byte 102,68,15,56,0,230
+ pshufb %xmm0,%xmm4
+ pshufb %xmm6,%xmm12
movdqa %xmm10,%xmm2
movdqa %xmm10,%xmm8
pxor %xmm5,%xmm4
pxor %xmm13,%xmm12
-.byte 102,15,56,0,211
-.byte 102,69,15,56,0,195
+ pshufb %xmm3,%xmm2
+ pshufb %xmm11,%xmm8
movdqa %xmm10,%xmm3
movdqa %xmm10,%xmm11
pxor %xmm0,%xmm2
pxor %xmm6,%xmm8
-.byte 102,15,56,0,220
-.byte 102,69,15,56,0,220
+ pshufb %xmm4,%xmm3
+ pshufb %xmm12,%xmm11
movdqu (%r9),%xmm5
pxor %xmm1,%xmm3
@@ -267,18 +267,18 @@
movdqa -80(%r10),%xmm0
movdqa %xmm4,%xmm12
movdqa %xmm0,%xmm6
-.byte 102,15,56,0,226
-.byte 102,69,15,56,0,224
+ pshufb %xmm2,%xmm4
+ pshufb %xmm8,%xmm12
pxor %xmm5,%xmm4
pxor %xmm5,%xmm12
-.byte 102,15,56,0,195
-.byte 102,65,15,56,0,243
+ pshufb %xmm3,%xmm0
+ pshufb %xmm11,%xmm6
movdqa 64(%r11,%r10,1),%xmm1
pxor %xmm4,%xmm0
pxor %xmm12,%xmm6
-.byte 102,15,56,0,193
-.byte 102,15,56,0,241
+ pshufb %xmm1,%xmm0
+ pshufb %xmm1,%xmm6
ret
.cfi_endproc
.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x
@@ -302,11 +302,11 @@
movdqu (%r9),%xmm5
shlq $4,%r11
pand %xmm9,%xmm0
-.byte 102,15,56,0,208
+ pshufb %xmm0,%xmm2
movdqa .Lk_dipt+16(%rip),%xmm0
xorq $0x30,%r11
leaq .Lk_dsbd(%rip),%r10
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
andq $0x30,%r11
pxor %xmm5,%xmm2
movdqa .Lk_mc_forward+48(%rip),%xmm5
@@ -322,35 +322,35 @@
movdqa -32(%r10),%xmm4
movdqa -16(%r10),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,203
+ pshufb %xmm2,%xmm4
+ pshufb %xmm3,%xmm1
pxor %xmm4,%xmm0
movdqa 0(%r10),%xmm4
pxor %xmm1,%xmm0
movdqa 16(%r10),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,197
-.byte 102,15,56,0,203
+ pshufb %xmm2,%xmm4
+ pshufb %xmm5,%xmm0
+ pshufb %xmm3,%xmm1
pxor %xmm4,%xmm0
movdqa 32(%r10),%xmm4
pxor %xmm1,%xmm0
movdqa 48(%r10),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,197
-.byte 102,15,56,0,203
+ pshufb %xmm2,%xmm4
+ pshufb %xmm5,%xmm0
+ pshufb %xmm3,%xmm1
pxor %xmm4,%xmm0
movdqa 64(%r10),%xmm4
pxor %xmm1,%xmm0
movdqa 80(%r10),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,197
-.byte 102,15,56,0,203
+ pshufb %xmm2,%xmm4
+ pshufb %xmm5,%xmm0
+ pshufb %xmm3,%xmm1
pxor %xmm4,%xmm0
addq $16,%r9
-.byte 102,15,58,15,237,12
+ palignr $12,%xmm5,%xmm5
pxor %xmm1,%xmm0
subq $1,%rax
@@ -361,32 +361,32 @@
movdqa %xmm11,%xmm2
psrld $4,%xmm1
pand %xmm9,%xmm0
-.byte 102,15,56,0,208
+ pshufb %xmm0,%xmm2
movdqa %xmm10,%xmm3
pxor %xmm1,%xmm0
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
movdqa %xmm10,%xmm4
pxor %xmm2,%xmm3
-.byte 102,15,56,0,224
+ pshufb %xmm0,%xmm4
pxor %xmm2,%xmm4
movdqa %xmm10,%xmm2
-.byte 102,15,56,0,211
+ pshufb %xmm3,%xmm2
movdqa %xmm10,%xmm3
pxor %xmm0,%xmm2
-.byte 102,15,56,0,220
+ pshufb %xmm4,%xmm3
movdqu (%r9),%xmm0
pxor %xmm1,%xmm3
jnz .Ldec_loop
movdqa 96(%r10),%xmm4
-.byte 102,15,56,0,226
+ pshufb %xmm2,%xmm4
pxor %xmm0,%xmm4
movdqa 112(%r10),%xmm0
movdqa -352(%r11),%xmm2
-.byte 102,15,56,0,195
+ pshufb %xmm3,%xmm0
pxor %xmm4,%xmm0
-.byte 102,15,56,0,194
+ pshufb %xmm2,%xmm0
ret
.cfi_endproc
.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
@@ -426,7 +426,7 @@
.Lschedule_am_decrypting:
movdqa (%r8,%r10,1),%xmm1
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
movdqu %xmm3,(%rdx)
xorq $0x30,%r8
@@ -480,7 +480,7 @@
.Loop_schedule_192:
call _vpaes_schedule_round
-.byte 102,15,58,15,198,8
+ palignr $8,%xmm6,%xmm0
call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
call _vpaes_schedule_mangle
@@ -546,7 +546,7 @@
movdqa (%r8,%r10,1),%xmm1
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
leaq .Lk_opt(%rip),%r11
addq $32,%rdx
@@ -622,13 +622,13 @@
.cfi_startproc
pxor %xmm1,%xmm1
-.byte 102,65,15,58,15,200,15
-.byte 102,69,15,58,15,192,15
+ palignr $15,%xmm8,%xmm1
+ palignr $15,%xmm8,%xmm8
pxor %xmm1,%xmm7
pshufd $0xFF,%xmm0,%xmm0
-.byte 102,15,58,15,192,1
+ palignr $1,%xmm0,%xmm0
@@ -649,24 +649,24 @@
psrld $4,%xmm1
pand %xmm9,%xmm0
movdqa %xmm11,%xmm2
-.byte 102,15,56,0,208
+ pshufb %xmm0,%xmm2
pxor %xmm1,%xmm0
movdqa %xmm10,%xmm3
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
pxor %xmm2,%xmm3
movdqa %xmm10,%xmm4
-.byte 102,15,56,0,224
+ pshufb %xmm0,%xmm4
pxor %xmm2,%xmm4
movdqa %xmm10,%xmm2
-.byte 102,15,56,0,211
+ pshufb %xmm3,%xmm2
pxor %xmm0,%xmm2
movdqa %xmm10,%xmm3
-.byte 102,15,56,0,220
+ pshufb %xmm4,%xmm3
pxor %xmm1,%xmm3
movdqa %xmm13,%xmm4
-.byte 102,15,56,0,226
+ pshufb %xmm2,%xmm4
movdqa %xmm12,%xmm0
-.byte 102,15,56,0,195
+ pshufb %xmm3,%xmm0
pxor %xmm4,%xmm0
@@ -694,9 +694,9 @@
psrld $4,%xmm1
pand %xmm9,%xmm0
movdqa (%r11),%xmm2
-.byte 102,15,56,0,208
+ pshufb %xmm0,%xmm2
movdqa 16(%r11),%xmm0
-.byte 102,15,56,0,193
+ pshufb %xmm1,%xmm0
pxor %xmm2,%xmm0
ret
.cfi_endproc
@@ -737,11 +737,11 @@
addq $16,%rdx
pxor .Lk_s63(%rip),%xmm4
-.byte 102,15,56,0,229
+ pshufb %xmm5,%xmm4
movdqa %xmm4,%xmm3
-.byte 102,15,56,0,229
+ pshufb %xmm5,%xmm4
pxor %xmm4,%xmm3
-.byte 102,15,56,0,229
+ pshufb %xmm5,%xmm4
pxor %xmm4,%xmm3
jmp .Lschedule_mangle_both
@@ -755,40 +755,40 @@
pand %xmm9,%xmm4
movdqa 0(%r11),%xmm2
-.byte 102,15,56,0,212
+ pshufb %xmm4,%xmm2
movdqa 16(%r11),%xmm3
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
pxor %xmm2,%xmm3
-.byte 102,15,56,0,221
+ pshufb %xmm5,%xmm3
movdqa 32(%r11),%xmm2
-.byte 102,15,56,0,212
+ pshufb %xmm4,%xmm2
pxor %xmm3,%xmm2
movdqa 48(%r11),%xmm3
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
pxor %xmm2,%xmm3
-.byte 102,15,56,0,221
+ pshufb %xmm5,%xmm3
movdqa 64(%r11),%xmm2
-.byte 102,15,56,0,212
+ pshufb %xmm4,%xmm2
pxor %xmm3,%xmm2
movdqa 80(%r11),%xmm3
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
pxor %xmm2,%xmm3
-.byte 102,15,56,0,221
+ pshufb %xmm5,%xmm3
movdqa 96(%r11),%xmm2
-.byte 102,15,56,0,212
+ pshufb %xmm4,%xmm2
pxor %xmm3,%xmm2
movdqa 112(%r11),%xmm3
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
pxor %xmm2,%xmm3
addq $-16,%rdx
.Lschedule_mangle_both:
movdqa (%r8,%r10,1),%xmm1
-.byte 102,15,56,0,217
+ pshufb %xmm1,%xmm3
addq $-16,%r8
andq $0x30,%r8
movdqu %xmm3,(%rdx)
@@ -971,8 +971,8 @@
movdqa .Lrev_ctr(%rip),%xmm1
movdqa %xmm14,%xmm0
movdqa %xmm15,%xmm6
-.byte 102,15,56,0,193
-.byte 102,15,56,0,241
+ pshufb %xmm1,%xmm0
+ pshufb %xmm1,%xmm6
call _vpaes_encrypt_core_2x
movdqu (%rdi),%xmm1
movdqu 16(%rdi),%xmm2
diff --git a/gen/bcm/vpaes-x86_64-win.asm b/gen/bcm/vpaes-x86_64-win.asm
index ddbfb12..e28ae47 100644
--- a/gen/bcm/vpaes-x86_64-win.asm
+++ b/gen/bcm/vpaes-x86_64-win.asm
@@ -42,9 +42,9 @@
movdqu xmm5,XMMWORD[r9]
psrld xmm1,4
pand xmm0,xmm9
-DB 102,15,56,0,208
+ pshufb xmm2,xmm0
movdqa xmm0,XMMWORD[(($L$k_ipt+16))]
-DB 102,15,56,0,193
+ pshufb xmm0,xmm1
pxor xmm2,xmm5
add r9,16
pxor xmm0,xmm2
@@ -56,25 +56,25 @@
movdqa xmm4,xmm13
movdqa xmm0,xmm12
-DB 102,15,56,0,226
-DB 102,15,56,0,195
+ pshufb xmm4,xmm2
+ pshufb xmm0,xmm3
pxor xmm4,xmm5
movdqa xmm5,xmm15
pxor xmm0,xmm4
movdqa xmm1,XMMWORD[((-64))+r10*1+r11]
-DB 102,15,56,0,234
+ pshufb xmm5,xmm2
movdqa xmm4,XMMWORD[r10*1+r11]
movdqa xmm2,xmm14
-DB 102,15,56,0,211
+ pshufb xmm2,xmm3
movdqa xmm3,xmm0
pxor xmm2,xmm5
-DB 102,15,56,0,193
+ pshufb xmm0,xmm1
add r9,16
pxor xmm0,xmm2
-DB 102,15,56,0,220
+ pshufb xmm3,xmm4
add r11,16
pxor xmm3,xmm0
-DB 102,15,56,0,193
+ pshufb xmm0,xmm1
and r11,0x30
sub rax,1
pxor xmm0,xmm3
@@ -86,19 +86,19 @@
pandn xmm1,xmm0
psrld xmm1,4
pand xmm0,xmm9
-DB 102,15,56,0,232
+ pshufb xmm5,xmm0
movdqa xmm3,xmm10
pxor xmm0,xmm1
-DB 102,15,56,0,217
+ pshufb xmm3,xmm1
movdqa xmm4,xmm10
pxor xmm3,xmm5
-DB 102,15,56,0,224
+ pshufb xmm4,xmm0
movdqa xmm2,xmm10
pxor xmm4,xmm5
-DB 102,15,56,0,211
+ pshufb xmm2,xmm3
movdqa xmm3,xmm10
pxor xmm2,xmm0
-DB 102,15,56,0,220
+ pshufb xmm3,xmm4
movdqu xmm5,XMMWORD[r9]
pxor xmm3,xmm1
jnz NEAR $L$enc_loop
@@ -106,12 +106,12 @@
movdqa xmm4,XMMWORD[((-96))+r10]
movdqa xmm0,XMMWORD[((-80))+r10]
-DB 102,15,56,0,226
+ pshufb xmm4,xmm2
pxor xmm4,xmm5
-DB 102,15,56,0,195
+ pshufb xmm0,xmm3
movdqa xmm1,XMMWORD[64+r10*1+r11]
pxor xmm0,xmm4
-DB 102,15,56,0,193
+ pshufb xmm0,xmm1
ret
@@ -164,12 +164,12 @@
psrld xmm7,4
pand xmm0,xmm9
pand xmm6,xmm9
-DB 102,15,56,0,208
-DB 102,68,15,56,0,198
+ pshufb xmm2,xmm0
+ pshufb xmm8,xmm6
movdqa xmm0,XMMWORD[(($L$k_ipt+16))]
movdqa xmm6,xmm0
-DB 102,15,56,0,193
-DB 102,15,56,0,247
+ pshufb xmm0,xmm1
+ pshufb xmm6,xmm7
pxor xmm2,xmm5
pxor xmm8,xmm5
add r9,16
@@ -185,10 +185,10 @@
movdqa xmm0,XMMWORD[(($L$k_sb1+16))]
movdqa xmm12,xmm4
movdqa xmm6,xmm0
-DB 102,15,56,0,226
-DB 102,69,15,56,0,224
-DB 102,15,56,0,195
-DB 102,65,15,56,0,243
+ pshufb xmm4,xmm2
+ pshufb xmm12,xmm8
+ pshufb xmm0,xmm3
+ pshufb xmm6,xmm11
pxor xmm4,xmm5
pxor xmm12,xmm5
movdqa xmm5,XMMWORD[$L$k_sb2]
@@ -197,30 +197,30 @@
pxor xmm6,xmm12
movdqa xmm1,XMMWORD[((-64))+r10*1+r11]
-DB 102,15,56,0,234
-DB 102,69,15,56,0,232
+ pshufb xmm5,xmm2
+ pshufb xmm13,xmm8
movdqa xmm4,XMMWORD[r10*1+r11]
movdqa xmm2,XMMWORD[(($L$k_sb2+16))]
movdqa xmm8,xmm2
-DB 102,15,56,0,211
-DB 102,69,15,56,0,195
+ pshufb xmm2,xmm3
+ pshufb xmm8,xmm11
movdqa xmm3,xmm0
movdqa xmm11,xmm6
pxor xmm2,xmm5
pxor xmm8,xmm13
-DB 102,15,56,0,193
-DB 102,15,56,0,241
+ pshufb xmm0,xmm1
+ pshufb xmm6,xmm1
add r9,16
pxor xmm0,xmm2
pxor xmm6,xmm8
-DB 102,15,56,0,220
-DB 102,68,15,56,0,220
+ pshufb xmm3,xmm4
+ pshufb xmm11,xmm4
add r11,16
pxor xmm3,xmm0
pxor xmm11,xmm6
-DB 102,15,56,0,193
-DB 102,15,56,0,241
+ pshufb xmm0,xmm1
+ pshufb xmm6,xmm1
and r11,0x30
sub rax,1
pxor xmm0,xmm3
@@ -238,32 +238,32 @@
psrld xmm7,4
pand xmm0,xmm9
pand xmm6,xmm9
-DB 102,15,56,0,232
-DB 102,68,15,56,0,238
+ pshufb xmm5,xmm0
+ pshufb xmm13,xmm6
movdqa xmm3,xmm10
movdqa xmm11,xmm10
pxor xmm0,xmm1
pxor xmm6,xmm7
-DB 102,15,56,0,217
-DB 102,68,15,56,0,223
+ pshufb xmm3,xmm1
+ pshufb xmm11,xmm7
movdqa xmm4,xmm10
movdqa xmm12,xmm10
pxor xmm3,xmm5
pxor xmm11,xmm13
-DB 102,15,56,0,224
-DB 102,68,15,56,0,230
+ pshufb xmm4,xmm0
+ pshufb xmm12,xmm6
movdqa xmm2,xmm10
movdqa xmm8,xmm10
pxor xmm4,xmm5
pxor xmm12,xmm13
-DB 102,15,56,0,211
-DB 102,69,15,56,0,195
+ pshufb xmm2,xmm3
+ pshufb xmm8,xmm11
movdqa xmm3,xmm10
movdqa xmm11,xmm10
pxor xmm2,xmm0
pxor xmm8,xmm6
-DB 102,15,56,0,220
-DB 102,69,15,56,0,220
+ pshufb xmm3,xmm4
+ pshufb xmm11,xmm12
movdqu xmm5,XMMWORD[r9]
pxor xmm3,xmm1
@@ -275,18 +275,18 @@
movdqa xmm0,XMMWORD[((-80))+r10]
movdqa xmm12,xmm4
movdqa xmm6,xmm0
-DB 102,15,56,0,226
-DB 102,69,15,56,0,224
+ pshufb xmm4,xmm2
+ pshufb xmm12,xmm8
pxor xmm4,xmm5
pxor xmm12,xmm5
-DB 102,15,56,0,195
-DB 102,65,15,56,0,243
+ pshufb xmm0,xmm3
+ pshufb xmm6,xmm11
movdqa xmm1,XMMWORD[64+r10*1+r11]
pxor xmm0,xmm4
pxor xmm6,xmm12
-DB 102,15,56,0,193
-DB 102,15,56,0,241
+ pshufb xmm0,xmm1
+ pshufb xmm6,xmm1
ret
@@ -310,11 +310,11 @@
movdqu xmm5,XMMWORD[r9]
shl r11,4
pand xmm0,xmm9
-DB 102,15,56,0,208
+ pshufb xmm2,xmm0
movdqa xmm0,XMMWORD[(($L$k_dipt+16))]
xor r11,0x30
lea r10,[$L$k_dsbd]
-DB 102,15,56,0,193
+ pshufb xmm0,xmm1
and r11,0x30
pxor xmm2,xmm5
movdqa xmm5,XMMWORD[(($L$k_mc_forward+48))]
@@ -330,35 +330,35 @@
movdqa xmm4,XMMWORD[((-32))+r10]
movdqa xmm1,XMMWORD[((-16))+r10]
-DB 102,15,56,0,226
-DB 102,15,56,0,203
+ pshufb xmm4,xmm2
+ pshufb xmm1,xmm3
pxor xmm0,xmm4
movdqa xmm4,XMMWORD[r10]
pxor xmm0,xmm1
movdqa xmm1,XMMWORD[16+r10]
-DB 102,15,56,0,226
-DB 102,15,56,0,197
-DB 102,15,56,0,203
+ pshufb xmm4,xmm2
+ pshufb xmm0,xmm5
+ pshufb xmm1,xmm3
pxor xmm0,xmm4
movdqa xmm4,XMMWORD[32+r10]
pxor xmm0,xmm1
movdqa xmm1,XMMWORD[48+r10]
-DB 102,15,56,0,226
-DB 102,15,56,0,197
-DB 102,15,56,0,203
+ pshufb xmm4,xmm2
+ pshufb xmm0,xmm5
+ pshufb xmm1,xmm3
pxor xmm0,xmm4
movdqa xmm4,XMMWORD[64+r10]
pxor xmm0,xmm1
movdqa xmm1,XMMWORD[80+r10]
-DB 102,15,56,0,226
-DB 102,15,56,0,197
-DB 102,15,56,0,203
+ pshufb xmm4,xmm2
+ pshufb xmm0,xmm5
+ pshufb xmm1,xmm3
pxor xmm0,xmm4
add r9,16
-DB 102,15,58,15,237,12
+ palignr xmm5,xmm5,12
pxor xmm0,xmm1
sub rax,1
@@ -369,32 +369,32 @@
movdqa xmm2,xmm11
psrld xmm1,4
pand xmm0,xmm9
-DB 102,15,56,0,208
+ pshufb xmm2,xmm0
movdqa xmm3,xmm10
pxor xmm0,xmm1
-DB 102,15,56,0,217
+ pshufb xmm3,xmm1
movdqa xmm4,xmm10
pxor xmm3,xmm2
-DB 102,15,56,0,224
+ pshufb xmm4,xmm0
pxor xmm4,xmm2
movdqa xmm2,xmm10
-DB 102,15,56,0,211
+ pshufb xmm2,xmm3
movdqa xmm3,xmm10
pxor xmm2,xmm0
-DB 102,15,56,0,220
+ pshufb xmm3,xmm4
movdqu xmm0,XMMWORD[r9]
pxor xmm3,xmm1
jnz NEAR $L$dec_loop
movdqa xmm4,XMMWORD[96+r10]
-DB 102,15,56,0,226
+ pshufb xmm4,xmm2
pxor xmm4,xmm0
movdqa xmm0,XMMWORD[112+r10]
movdqa xmm2,XMMWORD[((-352))+r11]
-DB 102,15,56,0,195
+ pshufb xmm0,xmm3
pxor xmm0,xmm4
-DB 102,15,56,0,194
+ pshufb xmm0,xmm2
ret
@@ -434,7 +434,7 @@
$L$schedule_am_decrypting:
movdqa xmm1,XMMWORD[r10*1+r8]
-DB 102,15,56,0,217
+ pshufb xmm3,xmm1
movdqu XMMWORD[rdx],xmm3
xor r8,0x30
@@ -488,7 +488,7 @@
$L$oop_schedule_192:
call _vpaes_schedule_round
-DB 102,15,58,15,198,8
+ palignr xmm0,xmm6,8
call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
call _vpaes_schedule_mangle
@@ -554,7 +554,7 @@
movdqa xmm1,XMMWORD[r10*1+r8]
-DB 102,15,56,0,193
+ pshufb xmm0,xmm1
lea r11,[$L$k_opt]
add rdx,32
@@ -630,13 +630,13 @@
pxor xmm1,xmm1
-DB 102,65,15,58,15,200,15
-DB 102,69,15,58,15,192,15
+ palignr xmm1,xmm8,15
+ palignr xmm8,xmm8,15
pxor xmm7,xmm1
pshufd xmm0,xmm0,0xFF
-DB 102,15,58,15,192,1
+ palignr xmm0,xmm0,1
@@ -657,24 +657,24 @@
psrld xmm1,4
pand xmm0,xmm9
movdqa xmm2,xmm11
-DB 102,15,56,0,208
+ pshufb xmm2,xmm0
pxor xmm0,xmm1
movdqa xmm3,xmm10
-DB 102,15,56,0,217
+ pshufb xmm3,xmm1
pxor xmm3,xmm2
movdqa xmm4,xmm10
-DB 102,15,56,0,224
+ pshufb xmm4,xmm0
pxor xmm4,xmm2
movdqa xmm2,xmm10
-DB 102,15,56,0,211
+ pshufb xmm2,xmm3
pxor xmm2,xmm0
movdqa xmm3,xmm10
-DB 102,15,56,0,220
+ pshufb xmm3,xmm4
pxor xmm3,xmm1
movdqa xmm4,xmm13
-DB 102,15,56,0,226
+ pshufb xmm4,xmm2
movdqa xmm0,xmm12
-DB 102,15,56,0,195
+ pshufb xmm0,xmm3
pxor xmm0,xmm4
@@ -702,9 +702,9 @@
psrld xmm1,4
pand xmm0,xmm9
movdqa xmm2,XMMWORD[r11]
-DB 102,15,56,0,208
+ pshufb xmm2,xmm0
movdqa xmm0,XMMWORD[16+r11]
-DB 102,15,56,0,193
+ pshufb xmm0,xmm1
pxor xmm0,xmm2
ret
@@ -745,11 +745,11 @@
add rdx,16
pxor xmm4,XMMWORD[$L$k_s63]
-DB 102,15,56,0,229
+ pshufb xmm4,xmm5
movdqa xmm3,xmm4
-DB 102,15,56,0,229
+ pshufb xmm4,xmm5
pxor xmm3,xmm4
-DB 102,15,56,0,229
+ pshufb xmm4,xmm5
pxor xmm3,xmm4
jmp NEAR $L$schedule_mangle_both
@@ -763,40 +763,40 @@
pand xmm4,xmm9
movdqa xmm2,XMMWORD[r11]
-DB 102,15,56,0,212
+ pshufb xmm2,xmm4
movdqa xmm3,XMMWORD[16+r11]
-DB 102,15,56,0,217
+ pshufb xmm3,xmm1
pxor xmm3,xmm2
-DB 102,15,56,0,221
+ pshufb xmm3,xmm5
movdqa xmm2,XMMWORD[32+r11]
-DB 102,15,56,0,212
+ pshufb xmm2,xmm4
pxor xmm2,xmm3
movdqa xmm3,XMMWORD[48+r11]
-DB 102,15,56,0,217
+ pshufb xmm3,xmm1
pxor xmm3,xmm2
-DB 102,15,56,0,221
+ pshufb xmm3,xmm5
movdqa xmm2,XMMWORD[64+r11]
-DB 102,15,56,0,212
+ pshufb xmm2,xmm4
pxor xmm2,xmm3
movdqa xmm3,XMMWORD[80+r11]
-DB 102,15,56,0,217
+ pshufb xmm3,xmm1
pxor xmm3,xmm2
-DB 102,15,56,0,221
+ pshufb xmm3,xmm5
movdqa xmm2,XMMWORD[96+r11]
-DB 102,15,56,0,212
+ pshufb xmm2,xmm4
pxor xmm2,xmm3
movdqa xmm3,XMMWORD[112+r11]
-DB 102,15,56,0,217
+ pshufb xmm3,xmm1
pxor xmm3,xmm2
add rdx,-16
$L$schedule_mangle_both:
movdqa xmm1,XMMWORD[r10*1+r8]
-DB 102,15,56,0,217
+ pshufb xmm3,xmm1
add r8,-16
and r8,0x30
movdqu XMMWORD[rdx],xmm3
@@ -1172,8 +1172,8 @@
movdqa xmm1,XMMWORD[$L$rev_ctr]
movdqa xmm0,xmm14
movdqa xmm6,xmm15
-DB 102,15,56,0,193
-DB 102,15,56,0,241
+ pshufb xmm0,xmm1
+ pshufb xmm6,xmm1
call _vpaes_encrypt_core_2x
movdqu xmm1,XMMWORD[rdi]
movdqu xmm2,XMMWORD[16+rdi]
diff --git a/gen/bcm/x86_64-mont-apple.S b/gen/bcm/x86_64-mont-apple.S
index 4bf0c6d..d429f7c 100644
--- a/gen/bcm/x86_64-mont-apple.S
+++ b/gen/bcm/x86_64-mont-apple.S
@@ -632,7 +632,7 @@
sbbq $0,%rax
movq %rbp,24(%rdi,%r14,8)
pxor %xmm0,%xmm0
-.byte 102,72,15,110,224
+ movq %rax,%xmm4
pcmpeqd %xmm5,%xmm5
pshufd $0,%xmm4,%xmm4
movq %r9,%r15
@@ -764,10 +764,10 @@
L$sqr8x_body:
-.byte 102,72,15,110,209
+ movq %rcx,%xmm2
pxor %xmm0,%xmm0
-.byte 102,72,15,110,207
-.byte 102,73,15,110,218
+ movq %rdi,%xmm1
+ movq %r10,%xmm3
testq %rdx,%rdx
jz L$sqr8x_nox
@@ -779,7 +779,7 @@
leaq (%r8,%rcx,1),%rbx
movq %rcx,%r9
movq %rcx,%rdx
-.byte 102,72,15,126,207
+ movq %xmm1,%rdi
sarq $3+2,%rcx
jmp L$sqr8x_sub
@@ -793,7 +793,7 @@
leaq (%rdi,%r9,1),%rbx
movq %r9,%rcx
movq %r9,%rdx
-.byte 102,72,15,126,207
+ movq %xmm1,%rdi
sarq $3+2,%rcx
jmp L$sqr8x_sub
@@ -821,7 +821,7 @@
leaq (%rbx,%r9,1),%rbx
leaq (%rdi,%r9,1),%rdi
-.byte 102,72,15,110,200
+ movq %rax,%xmm1
pxor %xmm0,%xmm0
pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
@@ -1179,7 +1179,7 @@
leaq 64(%rsp),%rbx
subq %rdx,%rdi
-.byte 102,73,15,110,207
+ movq %r15,%xmm1
pxor %xmm0,%xmm0
pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
diff --git a/gen/bcm/x86_64-mont-linux.S b/gen/bcm/x86_64-mont-linux.S
index 02b282d..630bb72 100644
--- a/gen/bcm/x86_64-mont-linux.S
+++ b/gen/bcm/x86_64-mont-linux.S
@@ -632,7 +632,7 @@
sbbq $0,%rax
movq %rbp,24(%rdi,%r14,8)
pxor %xmm0,%xmm0
-.byte 102,72,15,110,224
+ movq %rax,%xmm4
pcmpeqd %xmm5,%xmm5
pshufd $0,%xmm4,%xmm4
movq %r9,%r15
@@ -766,10 +766,10 @@
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:
-.byte 102,72,15,110,209
+ movq %rcx,%xmm2
pxor %xmm0,%xmm0
-.byte 102,72,15,110,207
-.byte 102,73,15,110,218
+ movq %rdi,%xmm1
+ movq %r10,%xmm3
testq %rdx,%rdx
jz .Lsqr8x_nox
@@ -781,7 +781,7 @@
leaq (%r8,%rcx,1),%rbx
movq %rcx,%r9
movq %rcx,%rdx
-.byte 102,72,15,126,207
+ movq %xmm1,%rdi
sarq $3+2,%rcx
jmp .Lsqr8x_sub
@@ -795,7 +795,7 @@
leaq (%rdi,%r9,1),%rbx
movq %r9,%rcx
movq %r9,%rdx
-.byte 102,72,15,126,207
+ movq %xmm1,%rdi
sarq $3+2,%rcx
jmp .Lsqr8x_sub
@@ -823,7 +823,7 @@
leaq (%rbx,%r9,1),%rbx
leaq (%rdi,%r9,1),%rdi
-.byte 102,72,15,110,200
+ movq %rax,%xmm1
pxor %xmm0,%xmm0
pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
@@ -1181,7 +1181,7 @@
leaq 64(%rsp),%rbx
subq %rdx,%rdi
-.byte 102,73,15,110,207
+ movq %r15,%xmm1
pxor %xmm0,%xmm0
pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
diff --git a/gen/bcm/x86_64-mont-win.asm b/gen/bcm/x86_64-mont-win.asm
index b0611fc..7e54c66 100644
--- a/gen/bcm/x86_64-mont-win.asm
+++ b/gen/bcm/x86_64-mont-win.asm
@@ -664,7 +664,7 @@
sbb rax,0
mov QWORD[24+r14*8+rdi],rbp
pxor xmm0,xmm0
-DB 102,72,15,110,224
+ movq xmm4,rax
pcmpeqd xmm5,xmm5
pshufd xmm4,xmm4,0
mov r15,r9
@@ -809,10 +809,10 @@
$L$sqr8x_body:
-DB 102,72,15,110,209
+ movq xmm2,rcx
pxor xmm0,xmm0
-DB 102,72,15,110,207
-DB 102,73,15,110,218
+ movq xmm1,rdi
+ movq xmm3,r10
test rdx,rdx
jz NEAR $L$sqr8x_nox
@@ -824,7 +824,7 @@
lea rbx,[rcx*1+r8]
mov r9,rcx
mov rdx,rcx
-DB 102,72,15,126,207
+ movq rdi,xmm1
sar rcx,3+2
jmp NEAR $L$sqr8x_sub
@@ -838,7 +838,7 @@
lea rbx,[r9*1+rdi]
mov rcx,r9
mov rdx,r9
-DB 102,72,15,126,207
+ movq rdi,xmm1
sar rcx,3+2
jmp NEAR $L$sqr8x_sub
@@ -866,7 +866,7 @@
lea rbx,[r9*1+rbx]
lea rdi,[r9*1+rdi]
-DB 102,72,15,110,200
+ movq xmm1,rax
pxor xmm0,xmm0
pshufd xmm1,xmm1,0
mov rsi,QWORD[40+rsp]
@@ -1237,7 +1237,7 @@
lea rbx,[64+rsp]
sub rdi,rdx
-DB 102,73,15,110,207
+ movq xmm1,r15
pxor xmm0,xmm0
pshufd xmm1,xmm1,0
mov rsi,QWORD[40+rsp]
diff --git a/gen/bcm/x86_64-mont5-apple.S b/gen/bcm/x86_64-mont5-apple.S
index 5cf770f..cd7d797 100644
--- a/gen/bcm/x86_64-mont5-apple.S
+++ b/gen/bcm/x86_64-mont5-apple.S
@@ -196,7 +196,7 @@
pshufd $0x4e,%xmm0,%xmm1
por %xmm1,%xmm0
leaq 256(%r12),%r12
-.byte 102,72,15,126,195
+ movq %xmm0,%rbx
movq (%r8),%r8
movq (%rsi),%rax
@@ -322,7 +322,7 @@
leaq 256(%r12),%r12
movq (%rsi),%rax
-.byte 102,72,15,126,195
+ movq %xmm0,%rbx
xorq %r15,%r15
movq %r8,%rbp
@@ -691,7 +691,7 @@
pshufd $0x4e,%xmm0,%xmm1
por %xmm1,%xmm0
leaq 256(%r12),%r12
-.byte 102,72,15,126,195
+ movq %xmm0,%rbx
movq %r13,16+8(%rsp)
movq %rdi,56+8(%rsp)
@@ -899,7 +899,7 @@
pshufd $0x4e,%xmm4,%xmm0
por %xmm4,%xmm0
leaq 256(%r12),%r12
-.byte 102,72,15,126,195
+ movq %xmm0,%rbx
movq (%r14,%r9,1),%r10
movq %r8,%rbp
@@ -1171,10 +1171,10 @@
movq %rax,40(%rsp)
L$power5_body:
-.byte 102,72,15,110,207
-.byte 102,72,15,110,209
-.byte 102,73,15,110,218
-.byte 102,72,15,110,226
+ movq %rdi,%xmm1
+ movq %rcx,%xmm2
+ movq %r10,%xmm3
+ movq %rdx,%xmm4
call __bn_sqr8x_internal
call __bn_post4x_internal
@@ -1187,8 +1187,8 @@
call __bn_sqr8x_internal
call __bn_post4x_internal
-.byte 102,72,15,126,209
-.byte 102,72,15,126,226
+ movq %xmm2,%rcx
+ movq %xmm4,%rdx
movq %rsi,%rdi
movq 40(%rsp),%rax
leaq 32(%rsp),%r8
@@ -1740,7 +1740,7 @@
adcq %rdx,%r8
movq %rbx,-16(%rdi)
movq %r8,-8(%rdi)
-.byte 102,72,15,126,213
+ movq %xmm2,%rbp
__bn_sqr8x_reduction:
xorq %rax,%rax
leaq (%r9,%rbp,1),%rcx
@@ -1984,11 +1984,11 @@
movq -8(%rbp),%rcx
xorq %rsi,%rsi
-.byte 102,72,15,126,213
+ movq %xmm2,%rbp
movq %r8,0(%rdi)
movq %r9,8(%rdi)
-.byte 102,73,15,126,217
+ movq %xmm3,%r9
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
@@ -2009,9 +2009,9 @@
movq 0(%rbp),%r12
leaq (%rdi,%r9,1),%rbx
movq %r9,%rcx
-.byte 102,72,15,126,207
+ movq %xmm1,%rdi
negq %rax
-.byte 102,72,15,126,206
+ movq %xmm1,%rsi
sarq $3+2,%rcx
decq %r12
xorq %r10,%r10
@@ -2321,7 +2321,7 @@
pshufd $0x4e,%xmm0,%xmm1
por %xmm1,%xmm0
leaq 256(%rdi),%rdi
-.byte 102,72,15,126,194
+ movq %xmm0,%rdx
leaq 64+32+8(%rsp),%rbx
movq %rdx,%r9
@@ -2472,7 +2472,7 @@
pshufd $0x4e,%xmm4,%xmm0
por %xmm4,%xmm0
leaq 256(%rdi),%rdi
-.byte 102,72,15,126,194
+ movq %xmm0,%rdx
movq %rbp,(%rbx)
leaq 32(%rbx,%rax,1),%rbx
@@ -2689,10 +2689,10 @@
pxor %xmm0,%xmm0
-.byte 102,72,15,110,207
-.byte 102,72,15,110,209
-.byte 102,73,15,110,218
-.byte 102,72,15,110,226
+ movq %rdi,%xmm1
+ movq %rcx,%xmm2
+ movq %r10,%xmm3
+ movq %rdx,%xmm4
movq %r8,32(%rsp)
movq %rax,40(%rsp)
@@ -2711,8 +2711,8 @@
movq %r10,%r9
movq %rsi,%rdi
-.byte 102,72,15,126,209
-.byte 102,72,15,126,226
+ movq %xmm2,%rcx
+ movq %xmm4,%rdx
movq 40(%rsp),%rax
call mulx4x_internal
@@ -3077,7 +3077,7 @@
.p2align 5
L$sqrx8x_outer_break:
movq %r9,72(%rdi)
-.byte 102,72,15,126,217
+ movq %xmm3,%rcx
movq %r10,80(%rdi)
movq %r11,88(%rdi)
movq %r12,96(%rdi)
@@ -3151,7 +3151,7 @@
movq %rax,48(%rdi)
movq %rbx,56(%rdi)
leaq 64(%rdi),%rdi
-.byte 102,72,15,126,213
+ movq %xmm2,%rbp
__bn_sqrx8x_reduction:
xorl %eax,%eax
movq 32+8(%rsp),%rbx
@@ -3331,10 +3331,10 @@
subq 16+8(%rsp),%rsi
L$sqrx8x_no_tail:
adcq 0(%rdi),%r8
-.byte 102,72,15,126,217
+ movq %xmm3,%rcx
adcq 8(%rdi),%r9
movq 56(%rbp),%rsi
-.byte 102,72,15,126,213
+ movq %xmm2,%rbp
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
@@ -3372,8 +3372,8 @@
negq %rax
sarq $3+2,%rcx
-.byte 102,72,15,126,202
-.byte 102,72,15,126,206
+ movq %xmm1,%rdx
+ movq %xmm1,%rsi
decq %r12
movq 8(%rbp),%r13
xorq %r8,%r8
diff --git a/gen/bcm/x86_64-mont5-linux.S b/gen/bcm/x86_64-mont5-linux.S
index dcc02fc..a147041 100644
--- a/gen/bcm/x86_64-mont5-linux.S
+++ b/gen/bcm/x86_64-mont5-linux.S
@@ -196,7 +196,7 @@
pshufd $0x4e,%xmm0,%xmm1
por %xmm1,%xmm0
leaq 256(%r12),%r12
-.byte 102,72,15,126,195
+ movq %xmm0,%rbx
movq (%r8),%r8
movq (%rsi),%rax
@@ -322,7 +322,7 @@
leaq 256(%r12),%r12
movq (%rsi),%rax
-.byte 102,72,15,126,195
+ movq %xmm0,%rbx
xorq %r15,%r15
movq %r8,%rbp
@@ -691,7 +691,7 @@
pshufd $0x4e,%xmm0,%xmm1
por %xmm1,%xmm0
leaq 256(%r12),%r12
-.byte 102,72,15,126,195
+ movq %xmm0,%rbx
movq %r13,16+8(%rsp)
movq %rdi,56+8(%rsp)
@@ -899,7 +899,7 @@
pshufd $0x4e,%xmm4,%xmm0
por %xmm4,%xmm0
leaq 256(%r12),%r12
-.byte 102,72,15,126,195
+ movq %xmm0,%rbx
movq (%r14,%r9,1),%r10
movq %r8,%rbp
@@ -1171,10 +1171,10 @@
movq %rax,40(%rsp)
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpower5_body:
-.byte 102,72,15,110,207
-.byte 102,72,15,110,209
-.byte 102,73,15,110,218
-.byte 102,72,15,110,226
+ movq %rdi,%xmm1
+ movq %rcx,%xmm2
+ movq %r10,%xmm3
+ movq %rdx,%xmm4
call __bn_sqr8x_internal
call __bn_post4x_internal
@@ -1187,8 +1187,8 @@
call __bn_sqr8x_internal
call __bn_post4x_internal
-.byte 102,72,15,126,209
-.byte 102,72,15,126,226
+ movq %xmm2,%rcx
+ movq %xmm4,%rdx
movq %rsi,%rdi
movq 40(%rsp),%rax
leaq 32(%rsp),%r8
@@ -1740,7 +1740,7 @@
adcq %rdx,%r8
movq %rbx,-16(%rdi)
movq %r8,-8(%rdi)
-.byte 102,72,15,126,213
+ movq %xmm2,%rbp
__bn_sqr8x_reduction:
xorq %rax,%rax
leaq (%r9,%rbp,1),%rcx
@@ -1984,11 +1984,11 @@
movq -8(%rbp),%rcx
xorq %rsi,%rsi
-.byte 102,72,15,126,213
+ movq %xmm2,%rbp
movq %r8,0(%rdi)
movq %r9,8(%rdi)
-.byte 102,73,15,126,217
+ movq %xmm3,%r9
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
@@ -2009,9 +2009,9 @@
movq 0(%rbp),%r12
leaq (%rdi,%r9,1),%rbx
movq %r9,%rcx
-.byte 102,72,15,126,207
+ movq %xmm1,%rdi
negq %rax
-.byte 102,72,15,126,206
+ movq %xmm1,%rsi
sarq $3+2,%rcx
decq %r12
xorq %r10,%r10
@@ -2321,7 +2321,7 @@
pshufd $0x4e,%xmm0,%xmm1
por %xmm1,%xmm0
leaq 256(%rdi),%rdi
-.byte 102,72,15,126,194
+ movq %xmm0,%rdx
leaq 64+32+8(%rsp),%rbx
movq %rdx,%r9
@@ -2472,7 +2472,7 @@
pshufd $0x4e,%xmm4,%xmm0
por %xmm4,%xmm0
leaq 256(%rdi),%rdi
-.byte 102,72,15,126,194
+ movq %xmm0,%rdx
movq %rbp,(%rbx)
leaq 32(%rbx,%rax,1),%rbx
@@ -2689,10 +2689,10 @@
pxor %xmm0,%xmm0
-.byte 102,72,15,110,207
-.byte 102,72,15,110,209
-.byte 102,73,15,110,218
-.byte 102,72,15,110,226
+ movq %rdi,%xmm1
+ movq %rcx,%xmm2
+ movq %r10,%xmm3
+ movq %rdx,%xmm4
movq %r8,32(%rsp)
movq %rax,40(%rsp)
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
@@ -2711,8 +2711,8 @@
movq %r10,%r9
movq %rsi,%rdi
-.byte 102,72,15,126,209
-.byte 102,72,15,126,226
+ movq %xmm2,%rcx
+ movq %xmm4,%rdx
movq 40(%rsp),%rax
call mulx4x_internal
@@ -3077,7 +3077,7 @@
.align 32
.Lsqrx8x_outer_break:
movq %r9,72(%rdi)
-.byte 102,72,15,126,217
+ movq %xmm3,%rcx
movq %r10,80(%rdi)
movq %r11,88(%rdi)
movq %r12,96(%rdi)
@@ -3151,7 +3151,7 @@
movq %rax,48(%rdi)
movq %rbx,56(%rdi)
leaq 64(%rdi),%rdi
-.byte 102,72,15,126,213
+ movq %xmm2,%rbp
__bn_sqrx8x_reduction:
xorl %eax,%eax
movq 32+8(%rsp),%rbx
@@ -3331,10 +3331,10 @@
subq 16+8(%rsp),%rsi
.Lsqrx8x_no_tail:
adcq 0(%rdi),%r8
-.byte 102,72,15,126,217
+ movq %xmm3,%rcx
adcq 8(%rdi),%r9
movq 56(%rbp),%rsi
-.byte 102,72,15,126,213
+ movq %xmm2,%rbp
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
@@ -3372,8 +3372,8 @@
negq %rax
sarq $3+2,%rcx
-.byte 102,72,15,126,202
-.byte 102,72,15,126,206
+ movq %xmm1,%rdx
+ movq %xmm1,%rsi
decq %r12
movq 8(%rbp),%r13
xorq %r8,%r8
diff --git a/gen/bcm/x86_64-mont5-win.asm b/gen/bcm/x86_64-mont5-win.asm
index 3b12405..5ddeb86 100644
--- a/gen/bcm/x86_64-mont5-win.asm
+++ b/gen/bcm/x86_64-mont5-win.asm
@@ -215,7 +215,7 @@
pshufd xmm1,xmm0,0x4e
por xmm0,xmm1
lea r12,[256+r12]
-DB 102,72,15,126,195
+ movq rbx,xmm0
mov r8,QWORD[r8]
mov rax,QWORD[rsi]
@@ -341,7 +341,7 @@
lea r12,[256+r12]
mov rax,QWORD[rsi]
-DB 102,72,15,126,195
+ movq rbx,xmm0
xor r15,r15
mov rbp,r8
@@ -725,7 +725,7 @@
pshufd xmm1,xmm0,0x4e
por xmm0,xmm1
lea r12,[256+r12]
-DB 102,72,15,126,195
+ movq rbx,xmm0
mov QWORD[((16+8))+rsp],r13
mov QWORD[((56+8))+rsp],rdi
@@ -933,7 +933,7 @@
pshufd xmm0,xmm4,0x4e
por xmm0,xmm4
lea r12,[256+r12]
-DB 102,72,15,126,195
+ movq rbx,xmm0
mov r10,QWORD[r9*1+r14]
mov rbp,r8
@@ -1216,10 +1216,10 @@
mov QWORD[40+rsp],rax
$L$power5_body:
-DB 102,72,15,110,207
-DB 102,72,15,110,209
-DB 102,73,15,110,218
-DB 102,72,15,110,226
+ movq xmm1,rdi
+ movq xmm2,rcx
+ movq xmm3,r10
+ movq xmm4,rdx
call __bn_sqr8x_internal
call __bn_post4x_internal
@@ -1232,8 +1232,8 @@
call __bn_sqr8x_internal
call __bn_post4x_internal
-DB 102,72,15,126,209
-DB 102,72,15,126,226
+ movq rcx,xmm2
+ movq rdx,xmm4
mov rdi,rsi
mov rax,QWORD[40+rsp]
lea r8,[32+rsp]
@@ -1786,7 +1786,7 @@
adc r8,rdx
mov QWORD[((-16))+rdi],rbx
mov QWORD[((-8))+rdi],r8
-DB 102,72,15,126,213
+ movq rbp,xmm2
__bn_sqr8x_reduction:
xor rax,rax
lea rcx,[rbp*1+r9]
@@ -2030,11 +2030,11 @@
mov rcx,QWORD[((-8))+rbp]
xor rsi,rsi
-DB 102,72,15,126,213
+ movq rbp,xmm2
mov QWORD[rdi],r8
mov QWORD[8+rdi],r9
-DB 102,73,15,126,217
+ movq r9,xmm3
mov QWORD[16+rdi],r10
mov QWORD[24+rdi],r11
mov QWORD[32+rdi],r12
@@ -2055,9 +2055,9 @@
mov r12,QWORD[rbp]
lea rbx,[r9*1+rdi]
mov rcx,r9
-DB 102,72,15,126,207
+ movq rdi,xmm1
neg rax
-DB 102,72,15,126,206
+ movq rsi,xmm1
sar rcx,3+2
dec r12
xor r10,r10
@@ -2380,7 +2380,7 @@
pshufd xmm1,xmm0,0x4e
por xmm0,xmm1
lea rdi,[256+rdi]
-DB 102,72,15,126,194
+ movq rdx,xmm0
lea rbx,[((64+32+8))+rsp]
mov r9,rdx
@@ -2531,7 +2531,7 @@
pshufd xmm0,xmm4,0x4e
por xmm0,xmm4
lea rdi,[256+rdi]
-DB 102,72,15,126,194
+ movq rdx,xmm0
mov QWORD[rbx],rbp
lea rbx,[32+rax*1+rbx]
@@ -2759,10 +2759,10 @@
pxor xmm0,xmm0
-DB 102,72,15,110,207
-DB 102,72,15,110,209
-DB 102,73,15,110,218
-DB 102,72,15,110,226
+ movq xmm1,rdi
+ movq xmm2,rcx
+ movq xmm3,r10
+ movq xmm4,rdx
mov QWORD[32+rsp],r8
mov QWORD[40+rsp],rax
@@ -2781,8 +2781,8 @@
mov r9,r10
mov rdi,rsi
-DB 102,72,15,126,209
-DB 102,72,15,126,226
+ movq rcx,xmm2
+ movq rdx,xmm4
mov rax,QWORD[40+rsp]
call mulx4x_internal
@@ -3148,7 +3148,7 @@
ALIGN 32
$L$sqrx8x_outer_break:
mov QWORD[72+rdi],r9
-DB 102,72,15,126,217
+ movq rcx,xmm3
mov QWORD[80+rdi],r10
mov QWORD[88+rdi],r11
mov QWORD[96+rdi],r12
@@ -3222,7 +3222,7 @@
mov QWORD[48+rdi],rax
mov QWORD[56+rdi],rbx
lea rdi,[64+rdi]
-DB 102,72,15,126,213
+ movq rbp,xmm2
__bn_sqrx8x_reduction:
xor eax,eax
mov rbx,QWORD[((32+8))+rsp]
@@ -3402,10 +3402,10 @@
sub rsi,QWORD[((16+8))+rsp]
$L$sqrx8x_no_tail:
adc r8,QWORD[rdi]
-DB 102,72,15,126,217
+ movq rcx,xmm3
adc r9,QWORD[8+rdi]
mov rsi,QWORD[56+rbp]
-DB 102,72,15,126,213
+ movq rbp,xmm2
adc r10,QWORD[16+rdi]
adc r11,QWORD[24+rdi]
adc r12,QWORD[32+rdi]
@@ -3443,8 +3443,8 @@
neg rax
sar rcx,3+2
-DB 102,72,15,126,202
-DB 102,72,15,126,206
+ movq rdx,xmm1
+ movq rsi,xmm1
dec r12
mov r13,QWORD[8+rbp]
xor r8,r8
diff --git a/gen/crypto/chacha-x86_64-apple.S b/gen/crypto/chacha-x86_64-apple.S
index d330661..09481e3 100644
--- a/gen/crypto/chacha-x86_64-apple.S
+++ b/gen/crypto/chacha-x86_64-apple.S
@@ -91,7 +91,7 @@
movq %rbp,64+0(%rsp)
movl $10,%ebp
movq %rsi,64+8(%rsp)
-.byte 102,72,15,126,214
+ movq %xmm2,%rsi
movq %rdi,64+16(%rsp)
movq %rsi,%rdi
shrq $32,%rdi
@@ -354,7 +354,7 @@
L$oop_ssse3:
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
-.byte 102,15,56,0,222
+ pshufb %xmm6,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
@@ -363,7 +363,7 @@
por %xmm4,%xmm1
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
-.byte 102,15,56,0,223
+ pshufb %xmm7,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
@@ -376,7 +376,7 @@
nop
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
-.byte 102,15,56,0,222
+ pshufb %xmm6,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
@@ -385,7 +385,7 @@
por %xmm4,%xmm1
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
-.byte 102,15,56,0,223
+ pshufb %xmm7,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
@@ -540,8 +540,8 @@
paddd %xmm13,%xmm9
pxor %xmm8,%xmm0
pxor %xmm9,%xmm1
-.byte 102,15,56,0,199
-.byte 102,15,56,0,207
+ pshufb %xmm7,%xmm0
+ pshufb %xmm7,%xmm1
paddd %xmm0,%xmm4
paddd %xmm1,%xmm5
pxor %xmm4,%xmm12
@@ -559,8 +559,8 @@
paddd %xmm13,%xmm9
pxor %xmm8,%xmm0
pxor %xmm9,%xmm1
-.byte 102,15,56,0,198
-.byte 102,15,56,0,206
+ pshufb %xmm6,%xmm0
+ pshufb %xmm6,%xmm1
paddd %xmm0,%xmm4
paddd %xmm1,%xmm5
pxor %xmm4,%xmm12
@@ -582,8 +582,8 @@
paddd %xmm15,%xmm11
pxor %xmm10,%xmm2
pxor %xmm11,%xmm3
-.byte 102,15,56,0,215
-.byte 102,15,56,0,223
+ pshufb %xmm7,%xmm2
+ pshufb %xmm7,%xmm3
paddd %xmm2,%xmm4
paddd %xmm3,%xmm5
pxor %xmm4,%xmm14
@@ -601,8 +601,8 @@
paddd %xmm15,%xmm11
pxor %xmm10,%xmm2
pxor %xmm11,%xmm3
-.byte 102,15,56,0,214
-.byte 102,15,56,0,222
+ pshufb %xmm6,%xmm2
+ pshufb %xmm6,%xmm3
paddd %xmm2,%xmm4
paddd %xmm3,%xmm5
pxor %xmm4,%xmm14
@@ -620,8 +620,8 @@
paddd %xmm14,%xmm9
pxor %xmm8,%xmm3
pxor %xmm9,%xmm0
-.byte 102,15,56,0,223
-.byte 102,15,56,0,199
+ pshufb %xmm7,%xmm3
+ pshufb %xmm7,%xmm0
paddd %xmm3,%xmm4
paddd %xmm0,%xmm5
pxor %xmm4,%xmm13
@@ -639,8 +639,8 @@
paddd %xmm14,%xmm9
pxor %xmm8,%xmm3
pxor %xmm9,%xmm0
-.byte 102,15,56,0,222
-.byte 102,15,56,0,198
+ pshufb %xmm6,%xmm3
+ pshufb %xmm6,%xmm0
paddd %xmm3,%xmm4
paddd %xmm0,%xmm5
pxor %xmm4,%xmm13
@@ -662,8 +662,8 @@
paddd %xmm12,%xmm11
pxor %xmm10,%xmm1
pxor %xmm11,%xmm2
-.byte 102,15,56,0,207
-.byte 102,15,56,0,215
+ pshufb %xmm7,%xmm1
+ pshufb %xmm7,%xmm2
paddd %xmm1,%xmm4
paddd %xmm2,%xmm5
pxor %xmm4,%xmm15
@@ -681,8 +681,8 @@
paddd %xmm12,%xmm11
pxor %xmm10,%xmm1
pxor %xmm11,%xmm2
-.byte 102,15,56,0,206
-.byte 102,15,56,0,214
+ pshufb %xmm6,%xmm1
+ pshufb %xmm6,%xmm2
paddd %xmm1,%xmm4
paddd %xmm2,%xmm5
pxor %xmm4,%xmm15
diff --git a/gen/crypto/chacha-x86_64-linux.S b/gen/crypto/chacha-x86_64-linux.S
index d76e6d0..8ea190d 100644
--- a/gen/crypto/chacha-x86_64-linux.S
+++ b/gen/crypto/chacha-x86_64-linux.S
@@ -97,7 +97,7 @@
movq %rbp,64+0(%rsp)
movl $10,%ebp
movq %rsi,64+8(%rsp)
-.byte 102,72,15,126,214
+ movq %xmm2,%rsi
movq %rdi,64+16(%rsp)
movq %rsi,%rdi
shrq $32,%rdi
@@ -360,7 +360,7 @@
.Loop_ssse3:
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
-.byte 102,15,56,0,222
+ pshufb %xmm6,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
@@ -369,7 +369,7 @@
por %xmm4,%xmm1
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
-.byte 102,15,56,0,223
+ pshufb %xmm7,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
@@ -382,7 +382,7 @@
nop
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
-.byte 102,15,56,0,222
+ pshufb %xmm6,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
@@ -391,7 +391,7 @@
por %xmm4,%xmm1
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
-.byte 102,15,56,0,223
+ pshufb %xmm7,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
@@ -546,8 +546,8 @@
paddd %xmm13,%xmm9
pxor %xmm8,%xmm0
pxor %xmm9,%xmm1
-.byte 102,15,56,0,199
-.byte 102,15,56,0,207
+ pshufb %xmm7,%xmm0
+ pshufb %xmm7,%xmm1
paddd %xmm0,%xmm4
paddd %xmm1,%xmm5
pxor %xmm4,%xmm12
@@ -565,8 +565,8 @@
paddd %xmm13,%xmm9
pxor %xmm8,%xmm0
pxor %xmm9,%xmm1
-.byte 102,15,56,0,198
-.byte 102,15,56,0,206
+ pshufb %xmm6,%xmm0
+ pshufb %xmm6,%xmm1
paddd %xmm0,%xmm4
paddd %xmm1,%xmm5
pxor %xmm4,%xmm12
@@ -588,8 +588,8 @@
paddd %xmm15,%xmm11
pxor %xmm10,%xmm2
pxor %xmm11,%xmm3
-.byte 102,15,56,0,215
-.byte 102,15,56,0,223
+ pshufb %xmm7,%xmm2
+ pshufb %xmm7,%xmm3
paddd %xmm2,%xmm4
paddd %xmm3,%xmm5
pxor %xmm4,%xmm14
@@ -607,8 +607,8 @@
paddd %xmm15,%xmm11
pxor %xmm10,%xmm2
pxor %xmm11,%xmm3
-.byte 102,15,56,0,214
-.byte 102,15,56,0,222
+ pshufb %xmm6,%xmm2
+ pshufb %xmm6,%xmm3
paddd %xmm2,%xmm4
paddd %xmm3,%xmm5
pxor %xmm4,%xmm14
@@ -626,8 +626,8 @@
paddd %xmm14,%xmm9
pxor %xmm8,%xmm3
pxor %xmm9,%xmm0
-.byte 102,15,56,0,223
-.byte 102,15,56,0,199
+ pshufb %xmm7,%xmm3
+ pshufb %xmm7,%xmm0
paddd %xmm3,%xmm4
paddd %xmm0,%xmm5
pxor %xmm4,%xmm13
@@ -645,8 +645,8 @@
paddd %xmm14,%xmm9
pxor %xmm8,%xmm3
pxor %xmm9,%xmm0
-.byte 102,15,56,0,222
-.byte 102,15,56,0,198
+ pshufb %xmm6,%xmm3
+ pshufb %xmm6,%xmm0
paddd %xmm3,%xmm4
paddd %xmm0,%xmm5
pxor %xmm4,%xmm13
@@ -668,8 +668,8 @@
paddd %xmm12,%xmm11
pxor %xmm10,%xmm1
pxor %xmm11,%xmm2
-.byte 102,15,56,0,207
-.byte 102,15,56,0,215
+ pshufb %xmm7,%xmm1
+ pshufb %xmm7,%xmm2
paddd %xmm1,%xmm4
paddd %xmm2,%xmm5
pxor %xmm4,%xmm15
@@ -687,8 +687,8 @@
paddd %xmm12,%xmm11
pxor %xmm10,%xmm1
pxor %xmm11,%xmm2
-.byte 102,15,56,0,206
-.byte 102,15,56,0,214
+ pshufb %xmm6,%xmm1
+ pshufb %xmm6,%xmm2
paddd %xmm1,%xmm4
paddd %xmm2,%xmm5
pxor %xmm4,%xmm15
diff --git a/gen/crypto/chacha-x86_64-win.asm b/gen/crypto/chacha-x86_64-win.asm
index f1f9a0d..f9cae3f 100644
--- a/gen/crypto/chacha-x86_64-win.asm
+++ b/gen/crypto/chacha-x86_64-win.asm
@@ -114,7 +114,7 @@
mov QWORD[((64+0))+rsp],rbp
mov ebp,10
mov QWORD[((64+8))+rsp],rsi
-DB 102,72,15,126,214
+ movq rsi,xmm2
mov QWORD[((64+16))+rsp],rdi
mov rdi,rsi
shr rdi,32
@@ -392,7 +392,7 @@
$L$oop_ssse3:
paddd xmm0,xmm1
pxor xmm3,xmm0
-DB 102,15,56,0,222
+ pshufb xmm3,xmm6
paddd xmm2,xmm3
pxor xmm1,xmm2
movdqa xmm4,xmm1
@@ -401,7 +401,7 @@
por xmm1,xmm4
paddd xmm0,xmm1
pxor xmm3,xmm0
-DB 102,15,56,0,223
+ pshufb xmm3,xmm7
paddd xmm2,xmm3
pxor xmm1,xmm2
movdqa xmm4,xmm1
@@ -414,7 +414,7 @@
nop
paddd xmm0,xmm1
pxor xmm3,xmm0
-DB 102,15,56,0,222
+ pshufb xmm3,xmm6
paddd xmm2,xmm3
pxor xmm1,xmm2
movdqa xmm4,xmm1
@@ -423,7 +423,7 @@
por xmm1,xmm4
paddd xmm0,xmm1
pxor xmm3,xmm0
-DB 102,15,56,0,223
+ pshufb xmm3,xmm7
paddd xmm2,xmm3
pxor xmm1,xmm2
movdqa xmm4,xmm1
@@ -603,8 +603,8 @@
paddd xmm9,xmm13
pxor xmm0,xmm8
pxor xmm1,xmm9
-DB 102,15,56,0,199
-DB 102,15,56,0,207
+ pshufb xmm0,xmm7
+ pshufb xmm1,xmm7
paddd xmm4,xmm0
paddd xmm5,xmm1
pxor xmm12,xmm4
@@ -622,8 +622,8 @@
paddd xmm9,xmm13
pxor xmm0,xmm8
pxor xmm1,xmm9
-DB 102,15,56,0,198
-DB 102,15,56,0,206
+ pshufb xmm0,xmm6
+ pshufb xmm1,xmm6
paddd xmm4,xmm0
paddd xmm5,xmm1
pxor xmm12,xmm4
@@ -645,8 +645,8 @@
paddd xmm11,xmm15
pxor xmm2,xmm10
pxor xmm3,xmm11
-DB 102,15,56,0,215
-DB 102,15,56,0,223
+ pshufb xmm2,xmm7
+ pshufb xmm3,xmm7
paddd xmm4,xmm2
paddd xmm5,xmm3
pxor xmm14,xmm4
@@ -664,8 +664,8 @@
paddd xmm11,xmm15
pxor xmm2,xmm10
pxor xmm3,xmm11
-DB 102,15,56,0,214
-DB 102,15,56,0,222
+ pshufb xmm2,xmm6
+ pshufb xmm3,xmm6
paddd xmm4,xmm2
paddd xmm5,xmm3
pxor xmm14,xmm4
@@ -683,8 +683,8 @@
paddd xmm9,xmm14
pxor xmm3,xmm8
pxor xmm0,xmm9
-DB 102,15,56,0,223
-DB 102,15,56,0,199
+ pshufb xmm3,xmm7
+ pshufb xmm0,xmm7
paddd xmm4,xmm3
paddd xmm5,xmm0
pxor xmm13,xmm4
@@ -702,8 +702,8 @@
paddd xmm9,xmm14
pxor xmm3,xmm8
pxor xmm0,xmm9
-DB 102,15,56,0,222
-DB 102,15,56,0,198
+ pshufb xmm3,xmm6
+ pshufb xmm0,xmm6
paddd xmm4,xmm3
paddd xmm5,xmm0
pxor xmm13,xmm4
@@ -725,8 +725,8 @@
paddd xmm11,xmm12
pxor xmm1,xmm10
pxor xmm2,xmm11
-DB 102,15,56,0,207
-DB 102,15,56,0,215
+ pshufb xmm1,xmm7
+ pshufb xmm2,xmm7
paddd xmm4,xmm1
paddd xmm5,xmm2
pxor xmm15,xmm4
@@ -744,8 +744,8 @@
paddd xmm11,xmm12
pxor xmm1,xmm10
pxor xmm2,xmm11
-DB 102,15,56,0,206
-DB 102,15,56,0,214
+ pshufb xmm1,xmm6
+ pshufb xmm2,xmm6
paddd xmm4,xmm1
paddd xmm5,xmm2
pxor xmm15,xmm4
diff --git a/gen/crypto/chacha20_poly1305_x86_64-apple.S b/gen/crypto/chacha20_poly1305_x86_64-apple.S
index a261463..4044212 100644
--- a/gen/crypto/chacha20_poly1305_x86_64-apple.S
+++ b/gen/crypto/chacha20_poly1305_x86_64-apple.S
@@ -278,9 +278,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb L$rol16(%rip),%xmm12
@@ -299,9 +299,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
decq %r10
jne L$open_sse_init_rounds
@@ -359,10 +359,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -413,10 +413,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -457,18 +457,18 @@
imulq %r12,%r9
addq %r10,%r15
adcq %rdx,%r9
-.byte 102,15,58,15,255,4
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,12
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $12,%xmm15,%xmm15
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
movdqa %xmm8,0+80(%rbp)
movdqa L$rol16(%rip),%xmm8
paddd %xmm7,%xmm3
@@ -493,10 +493,10 @@
adcq $0,%r12
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -532,10 +532,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -563,18 +563,18 @@
pslld $32-25,%xmm4
pxor %xmm8,%xmm4
movdqa 0+80(%rbp),%xmm8
-.byte 102,15,58,15,255,12
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,4
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $4,%xmm15,%xmm15
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
decq %rcx
jge L$open_sse_main_loop_rounds
@@ -776,9 +776,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb L$rol16(%rip),%xmm12
@@ -797,9 +797,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
cmpq $16,%rcx
jae L$open_sse_tail_64_rounds_and_x1hash
@@ -891,9 +891,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -912,9 +912,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb L$rol16(%rip),%xmm12
@@ -933,9 +933,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -954,9 +954,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
cmpq %rcx,%r8
jb L$open_sse_tail_128_rounds_and_x1hash
@@ -1076,9 +1076,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -1097,9 +1097,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb L$rol16(%rip),%xmm14
@@ -1118,9 +1118,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb L$rol16(%rip),%xmm12
@@ -1139,9 +1139,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -1160,9 +1160,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb L$rol16(%rip),%xmm14
@@ -1181,9 +1181,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
cmpq %rcx,%r8
jb L$open_sse_tail_192_rounds_and_x1hash
@@ -1368,9 +1368,9 @@
pslld $7,%xmm11
psrld $25,%xmm4
pxor %xmm11,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -1389,9 +1389,9 @@
pslld $7,%xmm11
psrld $25,%xmm5
pxor %xmm11,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb L$rol16(%rip),%xmm14
@@ -1410,9 +1410,9 @@
pslld $7,%xmm11
psrld $25,%xmm6
pxor %xmm11,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
movdqa 0+80(%rbp),%xmm11
movq 0+0+0(%rbp),%rax
movq %rax,%r15
@@ -1443,9 +1443,9 @@
pslld $7,%xmm9
psrld $25,%xmm7
pxor %xmm9,%xmm7
-.byte 102,15,58,15,255,4
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,12
+ palignr $4,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $12,%xmm15,%xmm15
movdqa 0+80(%rbp),%xmm9
movq 8+0+0(%rbp),%rax
movq %rax,%r9
@@ -1476,9 +1476,9 @@
pslld $7,%xmm11
psrld $25,%xmm4
pxor %xmm11,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -1497,9 +1497,9 @@
pslld $7,%xmm11
psrld $25,%xmm5
pxor %xmm11,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
imulq %r12,%r9
addq %r10,%r15
adcq %rdx,%r9
@@ -1521,9 +1521,9 @@
pslld $7,%xmm11
psrld $25,%xmm6
pxor %xmm11,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
movdqa 0+80(%rbp),%xmm11
movq %r13,%r10
movq %r14,%r11
@@ -1558,9 +1558,9 @@
pslld $7,%xmm9
psrld $25,%xmm7
pxor %xmm9,%xmm7
-.byte 102,15,58,15,255,12
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,4
+ palignr $12,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $4,%xmm15,%xmm15
movdqa 0+80(%rbp),%xmm9
addq $16,%r8
@@ -1707,7 +1707,7 @@
subq $1,%r8
jnz L$open_sse_tail_16_compose
-.byte 102,73,15,126,221
+ movq %xmm3,%r13
pextrq $1,%xmm3,%r14
pxor %xmm1,%xmm3
@@ -1880,9 +1880,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -1901,9 +1901,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb L$rol16(%rip),%xmm14
@@ -1922,9 +1922,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb L$rol16(%rip),%xmm12
@@ -1943,9 +1943,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -1964,9 +1964,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb L$rol16(%rip),%xmm14
@@ -1985,9 +1985,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
decq %r10
jnz L$open_sse_128_rounds
@@ -2155,10 +2155,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2194,10 +2194,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2225,18 +2225,18 @@
pslld $32-25,%xmm4
pxor %xmm8,%xmm4
movdqa 0+80(%rbp),%xmm8
-.byte 102,15,58,15,255,4
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,12
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $12,%xmm15,%xmm15
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
movdqa %xmm8,0+80(%rbp)
movdqa L$rol16(%rip),%xmm8
paddd %xmm7,%xmm3
@@ -2247,10 +2247,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2286,10 +2286,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2317,18 +2317,18 @@
pslld $32-25,%xmm4
pxor %xmm8,%xmm4
movdqa 0+80(%rbp),%xmm8
-.byte 102,15,58,15,255,12
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,4
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $4,%xmm15,%xmm15
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
decq %r10
jnz L$seal_sse_init_rounds
@@ -2451,10 +2451,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2503,10 +2503,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2547,18 +2547,18 @@
imulq %r12,%r9
addq %r10,%r15
adcq %rdx,%r9
-.byte 102,15,58,15,255,4
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,12
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $12,%xmm15,%xmm15
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
movdqa %xmm8,0+80(%rbp)
movdqa L$rol16(%rip),%xmm8
paddd %xmm7,%xmm3
@@ -2583,10 +2583,10 @@
adcq $0,%r12
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2622,10 +2622,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2653,18 +2653,18 @@
pslld $32-25,%xmm4
pxor %xmm8,%xmm4
movdqa 0+80(%rbp),%xmm8
-.byte 102,15,58,15,255,12
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,4
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $4,%xmm15,%xmm15
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
leaq 16(%rdi),%rdi
decq %r8
@@ -2877,9 +2877,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb L$rol16(%rip),%xmm12
@@ -2898,9 +2898,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
addq 0+0(%rdi),%r10
adcq 8+0(%rdi),%r11
adcq $1,%r12
@@ -3030,9 +3030,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -3051,9 +3051,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
addq 0+0(%rdi),%r10
adcq 8+0(%rdi),%r11
adcq $1,%r12
@@ -3112,9 +3112,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -3133,9 +3133,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
leaq 16(%rdi),%rdi
decq %rcx
@@ -3250,9 +3250,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -3271,9 +3271,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb L$rol16(%rip),%xmm14
@@ -3292,9 +3292,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
addq 0+0(%rdi),%r10
adcq 8+0(%rdi),%r11
adcq $1,%r12
@@ -3353,9 +3353,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -3374,9 +3374,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb L$rol16(%rip),%xmm14
@@ -3395,9 +3395,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
leaq 16(%rdi),%rdi
decq %rcx
@@ -3649,7 +3649,7 @@
-.byte 102,77,15,126,253
+ movq %xmm15,%r13
pextrq $1,%xmm15,%r14
addq %r13,%r10
adcq %r14,%r11
@@ -3765,7 +3765,7 @@
leaq L$and_masks(%rip),%r15
shlq $4,%rbx
pand -16(%r15,%rbx,1),%xmm15
-.byte 102,77,15,126,253
+ movq %xmm15,%r13
pextrq $1,%xmm15,%r14
addq %r13,%r10
adcq %r14,%r11
@@ -3927,9 +3927,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -3948,9 +3948,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb L$rol16(%rip),%xmm14
@@ -3969,9 +3969,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb L$rol16(%rip),%xmm12
@@ -3990,9 +3990,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb L$rol16(%rip),%xmm13
@@ -4011,9 +4011,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb L$rol16(%rip),%xmm14
@@ -4032,9 +4032,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
decq %r10
jnz L$seal_sse_128_rounds
diff --git a/gen/crypto/chacha20_poly1305_x86_64-linux.S b/gen/crypto/chacha20_poly1305_x86_64-linux.S
index 180b41e..6fd94c8 100644
--- a/gen/crypto/chacha20_poly1305_x86_64-linux.S
+++ b/gen/crypto/chacha20_poly1305_x86_64-linux.S
@@ -285,9 +285,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb .Lrol16(%rip),%xmm12
@@ -306,9 +306,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
decq %r10
jne .Lopen_sse_init_rounds
@@ -366,10 +366,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -420,10 +420,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -464,18 +464,18 @@
imulq %r12,%r9
addq %r10,%r15
adcq %rdx,%r9
-.byte 102,15,58,15,255,4
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,12
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $12,%xmm15,%xmm15
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
movdqa %xmm8,0+80(%rbp)
movdqa .Lrol16(%rip),%xmm8
paddd %xmm7,%xmm3
@@ -500,10 +500,10 @@
adcq $0,%r12
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -539,10 +539,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -570,18 +570,18 @@
pslld $32-25,%xmm4
pxor %xmm8,%xmm4
movdqa 0+80(%rbp),%xmm8
-.byte 102,15,58,15,255,12
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,4
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $4,%xmm15,%xmm15
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
decq %rcx
jge .Lopen_sse_main_loop_rounds
@@ -783,9 +783,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb .Lrol16(%rip),%xmm12
@@ -804,9 +804,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
cmpq $16,%rcx
jae .Lopen_sse_tail_64_rounds_and_x1hash
@@ -898,9 +898,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -919,9 +919,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb .Lrol16(%rip),%xmm12
@@ -940,9 +940,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -961,9 +961,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
cmpq %rcx,%r8
jb .Lopen_sse_tail_128_rounds_and_x1hash
@@ -1083,9 +1083,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -1104,9 +1104,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb .Lrol16(%rip),%xmm14
@@ -1125,9 +1125,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb .Lrol16(%rip),%xmm12
@@ -1146,9 +1146,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -1167,9 +1167,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb .Lrol16(%rip),%xmm14
@@ -1188,9 +1188,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
cmpq %rcx,%r8
jb .Lopen_sse_tail_192_rounds_and_x1hash
@@ -1375,9 +1375,9 @@
pslld $7,%xmm11
psrld $25,%xmm4
pxor %xmm11,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -1396,9 +1396,9 @@
pslld $7,%xmm11
psrld $25,%xmm5
pxor %xmm11,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb .Lrol16(%rip),%xmm14
@@ -1417,9 +1417,9 @@
pslld $7,%xmm11
psrld $25,%xmm6
pxor %xmm11,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
movdqa 0+80(%rbp),%xmm11
movq 0+0+0(%rbp),%rax
movq %rax,%r15
@@ -1450,9 +1450,9 @@
pslld $7,%xmm9
psrld $25,%xmm7
pxor %xmm9,%xmm7
-.byte 102,15,58,15,255,4
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,12
+ palignr $4,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $12,%xmm15,%xmm15
movdqa 0+80(%rbp),%xmm9
movq 8+0+0(%rbp),%rax
movq %rax,%r9
@@ -1483,9 +1483,9 @@
pslld $7,%xmm11
psrld $25,%xmm4
pxor %xmm11,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -1504,9 +1504,9 @@
pslld $7,%xmm11
psrld $25,%xmm5
pxor %xmm11,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
imulq %r12,%r9
addq %r10,%r15
adcq %rdx,%r9
@@ -1528,9 +1528,9 @@
pslld $7,%xmm11
psrld $25,%xmm6
pxor %xmm11,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
movdqa 0+80(%rbp),%xmm11
movq %r13,%r10
movq %r14,%r11
@@ -1565,9 +1565,9 @@
pslld $7,%xmm9
psrld $25,%xmm7
pxor %xmm9,%xmm7
-.byte 102,15,58,15,255,12
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,4
+ palignr $12,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $4,%xmm15,%xmm15
movdqa 0+80(%rbp),%xmm9
addq $16,%r8
@@ -1714,7 +1714,7 @@
subq $1,%r8
jnz .Lopen_sse_tail_16_compose
-.byte 102,73,15,126,221
+ movq %xmm3,%r13
pextrq $1,%xmm3,%r14
pxor %xmm1,%xmm3
@@ -1894,9 +1894,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -1915,9 +1915,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb .Lrol16(%rip),%xmm14
@@ -1936,9 +1936,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb .Lrol16(%rip),%xmm12
@@ -1957,9 +1957,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -1978,9 +1978,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb .Lrol16(%rip),%xmm14
@@ -1999,9 +1999,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
decq %r10
jnz .Lopen_sse_128_rounds
@@ -2176,10 +2176,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2215,10 +2215,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2246,18 +2246,18 @@
pslld $32-25,%xmm4
pxor %xmm8,%xmm4
movdqa 0+80(%rbp),%xmm8
-.byte 102,15,58,15,255,4
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,12
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $12,%xmm15,%xmm15
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
movdqa %xmm8,0+80(%rbp)
movdqa .Lrol16(%rip),%xmm8
paddd %xmm7,%xmm3
@@ -2268,10 +2268,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2307,10 +2307,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2338,18 +2338,18 @@
pslld $32-25,%xmm4
pxor %xmm8,%xmm4
movdqa 0+80(%rbp),%xmm8
-.byte 102,15,58,15,255,12
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,4
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $4,%xmm15,%xmm15
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
decq %r10
jnz .Lseal_sse_init_rounds
@@ -2472,10 +2472,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2524,10 +2524,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2568,18 +2568,18 @@
imulq %r12,%r9
addq %r10,%r15
adcq %rdx,%r9
-.byte 102,15,58,15,255,4
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,12
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $12,%xmm15,%xmm15
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
movdqa %xmm8,0+80(%rbp)
movdqa .Lrol16(%rip),%xmm8
paddd %xmm7,%xmm3
@@ -2604,10 +2604,10 @@
adcq $0,%r12
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2643,10 +2643,10 @@
pxor %xmm2,%xmm14
pxor %xmm1,%xmm13
pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
+ pshufb %xmm8,%xmm15
+ pshufb %xmm8,%xmm14
+ pshufb %xmm8,%xmm13
+ pshufb %xmm8,%xmm12
movdqa 0+80(%rbp),%xmm8
paddd %xmm15,%xmm11
paddd %xmm14,%xmm10
@@ -2674,18 +2674,18 @@
pslld $32-25,%xmm4
pxor %xmm8,%xmm4
movdqa 0+80(%rbp),%xmm8
-.byte 102,15,58,15,255,12
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,4
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm7,%xmm7
+ palignr $8,%xmm11,%xmm11
+ palignr $4,%xmm15,%xmm15
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
leaq 16(%rdi),%rdi
decq %r8
@@ -2898,9 +2898,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb .Lrol16(%rip),%xmm12
@@ -2919,9 +2919,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
addq 0+0(%rdi),%r10
adcq 8+0(%rdi),%r11
adcq $1,%r12
@@ -3051,9 +3051,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -3072,9 +3072,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
addq 0+0(%rdi),%r10
adcq 8+0(%rdi),%r11
adcq $1,%r12
@@ -3133,9 +3133,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -3154,9 +3154,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
leaq 16(%rdi),%rdi
decq %rcx
@@ -3271,9 +3271,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -3292,9 +3292,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb .Lrol16(%rip),%xmm14
@@ -3313,9 +3313,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
addq 0+0(%rdi),%r10
adcq 8+0(%rdi),%r11
adcq $1,%r12
@@ -3374,9 +3374,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -3395,9 +3395,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb .Lrol16(%rip),%xmm14
@@ -3416,9 +3416,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
leaq 16(%rdi),%rdi
decq %rcx
@@ -3670,7 +3670,7 @@
-.byte 102,77,15,126,253
+ movq %xmm15,%r13
pextrq $1,%xmm15,%r14
addq %r13,%r10
adcq %r14,%r11
@@ -3786,7 +3786,7 @@
leaq .Land_masks(%rip),%r15
shlq $4,%rbx
pand -16(%r15,%rbx,1),%xmm15
-.byte 102,77,15,126,253
+ movq %xmm15,%r13
pextrq $1,%xmm15,%r14
addq %r13,%r10
adcq %r14,%r11
@@ -3955,9 +3955,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
+ palignr $4,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $12,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -3976,9 +3976,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
+ palignr $4,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $12,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb .Lrol16(%rip),%xmm14
@@ -3997,9 +3997,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
+ palignr $4,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $12,%xmm14,%xmm14
paddd %xmm4,%xmm0
pxor %xmm0,%xmm12
pshufb .Lrol16(%rip),%xmm12
@@ -4018,9 +4018,9 @@
pslld $7,%xmm3
psrld $25,%xmm4
pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
+ palignr $12,%xmm4,%xmm4
+ palignr $8,%xmm8,%xmm8
+ palignr $4,%xmm12,%xmm12
paddd %xmm5,%xmm1
pxor %xmm1,%xmm13
pshufb .Lrol16(%rip),%xmm13
@@ -4039,9 +4039,9 @@
pslld $7,%xmm3
psrld $25,%xmm5
pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
+ palignr $12,%xmm5,%xmm5
+ palignr $8,%xmm9,%xmm9
+ palignr $4,%xmm13,%xmm13
paddd %xmm6,%xmm2
pxor %xmm2,%xmm14
pshufb .Lrol16(%rip),%xmm14
@@ -4060,9 +4060,9 @@
pslld $7,%xmm3
psrld $25,%xmm6
pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
+ palignr $12,%xmm6,%xmm6
+ palignr $8,%xmm10,%xmm10
+ palignr $4,%xmm14,%xmm14
decq %r10
jnz .Lseal_sse_128_rounds
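
For reference, the removed ".byte"/"DB" sequences above are simply the hand-assembled encodings of the palignr, pshufb and movq instructions that replace them: 0x66 (102) is the operand-size prefix, an optional REX byte such as 0x45 (69) extends the ModR/M register fields, 0f 3a 0f is PALIGNR, 0f 38 00 is PSHUFB, and 66 REX.W 0f 7e is MOVQ r64,xmm. The standalone Perl sketch below is illustrative only (it is not part of the tree, and the helper name is made up); it decodes a PALIGNR byte sequence back to its mnemonic, which is one way to spot-check hunks like the ones above by hand.

    #!/usr/bin/env perl
    # Sketch: decode a hand-encoded PALIGNR ".byte" sequence back to a mnemonic.
    use strict;
    use warnings;

    sub decode_palignr {
        my @b = @_;                      # e.g. (102,69,15,58,15,228,12)
        shift @b;                        # drop the 0x66 operand-size prefix
        my $rex = ($b[0] & 0xf0) == 0x40 ? shift @b : 0;   # optional REX byte
        splice(@b, 0, 3);                # drop the 0f 3a 0f PALIGNR opcode
        my ($modrm, $imm) = @b;
        my $dst = (($modrm >> 3) & 7) | (($rex & 0x4) << 1);   # ModR/M.reg + REX.R
        my $src = ($modrm & 7)        | (($rex & 0x1) << 3);   # ModR/M.rm  + REX.B
        return sprintf("palignr \$%d,%%xmm%d,%%xmm%d", $imm, $src, $dst);
    }

    print decode_palignr(102,69,15,58,15,228,12), "\n";  # palignr $12,%xmm12,%xmm12
    print decode_palignr(102,15,58,15,228,4), "\n";      # palignr $4,%xmm4,%xmm4

The same bookkeeping explains the other removed sequences, e.g. 102,69,15,56,0,248 is pshufb %xmm8,%xmm15 and 102,73,15,126,221 is movq %xmm3,%r13, matching the mnemonics introduced in the hunks above and below.
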
diff --git a/gen/crypto/chacha20_poly1305_x86_64-win.asm b/gen/crypto/chacha20_poly1305_x86_64-win.asm
index 25c69ef..7ff65db 100644
--- a/gen/crypto/chacha20_poly1305_x86_64-win.asm
+++ b/gen/crypto/chacha20_poly1305_x86_64-win.asm
@@ -308,9 +308,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
paddd xmm0,xmm4
pxor xmm12,xmm0
pshufb xmm12,XMMWORD[$L$rol16]
@@ -329,9 +329,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
dec r10
jne NEAR $L$open_sse_init_rounds
@@ -389,10 +389,10 @@
pxor xmm14,xmm2
pxor xmm13,xmm1
pxor xmm12,xmm0
-DB 102,69,15,56,0,248
-DB 102,69,15,56,0,240
-DB 102,69,15,56,0,232
-DB 102,69,15,56,0,224
+ pshufb xmm15,xmm8
+ pshufb xmm14,xmm8
+ pshufb xmm13,xmm8
+ pshufb xmm12,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
paddd xmm11,xmm15
paddd xmm10,xmm14
@@ -443,10 +443,10 @@
pxor xmm14,xmm2
pxor xmm13,xmm1
pxor xmm12,xmm0
-DB 102,69,15,56,0,248
-DB 102,69,15,56,0,240
-DB 102,69,15,56,0,232
-DB 102,69,15,56,0,224
+ pshufb xmm15,xmm8
+ pshufb xmm14,xmm8
+ pshufb xmm13,xmm8
+ pshufb xmm12,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
paddd xmm11,xmm15
paddd xmm10,xmm14
@@ -487,18 +487,18 @@
imul r9,r12
add r15,r10
adc r9,rdx
-DB 102,15,58,15,255,4
-DB 102,69,15,58,15,219,8
-DB 102,69,15,58,15,255,12
-DB 102,15,58,15,246,4
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,12
-DB 102,15,58,15,237,4
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,12
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm7,xmm7,4
+ palignr xmm11,xmm11,8
+ palignr xmm15,xmm15,12
+ palignr xmm6,xmm6,4
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,12
+ palignr xmm5,xmm5,4
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
movdqa XMMWORD[(160+80)+rbp],xmm8
movdqa xmm8,XMMWORD[$L$rol16]
paddd xmm3,xmm7
@@ -523,10 +523,10 @@
adc r12,0
pxor xmm13,xmm1
pxor xmm12,xmm0
-DB 102,69,15,56,0,248
-DB 102,69,15,56,0,240
-DB 102,69,15,56,0,232
-DB 102,69,15,56,0,224
+ pshufb xmm15,xmm8
+ pshufb xmm14,xmm8
+ pshufb xmm13,xmm8
+ pshufb xmm12,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
paddd xmm11,xmm15
paddd xmm10,xmm14
@@ -562,10 +562,10 @@
pxor xmm14,xmm2
pxor xmm13,xmm1
pxor xmm12,xmm0
-DB 102,69,15,56,0,248
-DB 102,69,15,56,0,240
-DB 102,69,15,56,0,232
-DB 102,69,15,56,0,224
+ pshufb xmm15,xmm8
+ pshufb xmm14,xmm8
+ pshufb xmm13,xmm8
+ pshufb xmm12,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
paddd xmm11,xmm15
paddd xmm10,xmm14
@@ -593,18 +593,18 @@
pslld xmm4,32-25
pxor xmm4,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
-DB 102,15,58,15,255,12
-DB 102,69,15,58,15,219,8
-DB 102,69,15,58,15,255,4
-DB 102,15,58,15,246,12
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,4
-DB 102,15,58,15,237,12
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,4
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm7,xmm7,12
+ palignr xmm11,xmm11,8
+ palignr xmm15,xmm15,4
+ palignr xmm6,xmm6,12
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,4
+ palignr xmm5,xmm5,12
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
dec rcx
jge NEAR $L$open_sse_main_loop_rounds
@@ -806,9 +806,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
paddd xmm0,xmm4
pxor xmm12,xmm0
pshufb xmm12,XMMWORD[$L$rol16]
@@ -827,9 +827,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
cmp rcx,16
jae NEAR $L$open_sse_tail_64_rounds_and_x1hash
@@ -921,9 +921,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -942,9 +942,9 @@
pslld xmm3,7
psrld xmm5,25
pxor xmm5,xmm3
-DB 102,15,58,15,237,4
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,12
+ palignr xmm5,xmm5,4
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,12
paddd xmm0,xmm4
pxor xmm12,xmm0
pshufb xmm12,XMMWORD[$L$rol16]
@@ -963,9 +963,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -984,9 +984,9 @@
pslld xmm3,7
psrld xmm5,25
pxor xmm5,xmm3
-DB 102,15,58,15,237,12
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,4
+ palignr xmm5,xmm5,12
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,4
cmp r8,rcx
jb NEAR $L$open_sse_tail_128_rounds_and_x1hash
@@ -1106,9 +1106,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -1127,9 +1127,9 @@
pslld xmm3,7
psrld xmm5,25
pxor xmm5,xmm3
-DB 102,15,58,15,237,4
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,12
+ palignr xmm5,xmm5,4
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,12
paddd xmm2,xmm6
pxor xmm14,xmm2
pshufb xmm14,XMMWORD[$L$rol16]
@@ -1148,9 +1148,9 @@
pslld xmm3,7
psrld xmm6,25
pxor xmm6,xmm3
-DB 102,15,58,15,246,4
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,12
+ palignr xmm6,xmm6,4
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,12
paddd xmm0,xmm4
pxor xmm12,xmm0
pshufb xmm12,XMMWORD[$L$rol16]
@@ -1169,9 +1169,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -1190,9 +1190,9 @@
pslld xmm3,7
psrld xmm5,25
pxor xmm5,xmm3
-DB 102,15,58,15,237,12
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,4
+ palignr xmm5,xmm5,12
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,4
paddd xmm2,xmm6
pxor xmm14,xmm2
pshufb xmm14,XMMWORD[$L$rol16]
@@ -1211,9 +1211,9 @@
pslld xmm3,7
psrld xmm6,25
pxor xmm6,xmm3
-DB 102,15,58,15,246,12
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,4
+ palignr xmm6,xmm6,12
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,4
cmp r8,rcx
jb NEAR $L$open_sse_tail_192_rounds_and_x1hash
@@ -1398,9 +1398,9 @@
pslld xmm11,7
psrld xmm4,25
pxor xmm4,xmm11
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -1419,9 +1419,9 @@
pslld xmm11,7
psrld xmm5,25
pxor xmm5,xmm11
-DB 102,15,58,15,237,4
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,12
+ palignr xmm5,xmm5,4
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,12
paddd xmm2,xmm6
pxor xmm14,xmm2
pshufb xmm14,XMMWORD[$L$rol16]
@@ -1440,9 +1440,9 @@
pslld xmm11,7
psrld xmm6,25
pxor xmm6,xmm11
-DB 102,15,58,15,246,4
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,12
+ palignr xmm6,xmm6,4
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,12
movdqa xmm11,XMMWORD[((160+80))+rbp]
mov rax,QWORD[((0+160+0))+rbp]
mov r15,rax
@@ -1473,9 +1473,9 @@
pslld xmm9,7
psrld xmm7,25
pxor xmm7,xmm9
-DB 102,15,58,15,255,4
-DB 102,69,15,58,15,219,8
-DB 102,69,15,58,15,255,12
+ palignr xmm7,xmm7,4
+ palignr xmm11,xmm11,8
+ palignr xmm15,xmm15,12
movdqa xmm9,XMMWORD[((160+80))+rbp]
mov rax,QWORD[((8+160+0))+rbp]
mov r9,rax
@@ -1506,9 +1506,9 @@
pslld xmm11,7
psrld xmm4,25
pxor xmm4,xmm11
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -1527,9 +1527,9 @@
pslld xmm11,7
psrld xmm5,25
pxor xmm5,xmm11
-DB 102,15,58,15,237,12
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,4
+ palignr xmm5,xmm5,12
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,4
imul r9,r12
add r15,r10
adc r9,rdx
@@ -1551,9 +1551,9 @@
pslld xmm11,7
psrld xmm6,25
pxor xmm6,xmm11
-DB 102,15,58,15,246,12
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,4
+ palignr xmm6,xmm6,12
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,4
movdqa xmm11,XMMWORD[((160+80))+rbp]
mov r10,r13
mov r11,r14
@@ -1588,9 +1588,9 @@
pslld xmm9,7
psrld xmm7,25
pxor xmm7,xmm9
-DB 102,15,58,15,255,12
-DB 102,69,15,58,15,219,8
-DB 102,69,15,58,15,255,4
+ palignr xmm7,xmm7,12
+ palignr xmm11,xmm11,8
+ palignr xmm15,xmm15,4
movdqa xmm9,XMMWORD[((160+80))+rbp]
add r8,16
@@ -1737,7 +1737,7 @@
sub r8,1
jnz NEAR $L$open_sse_tail_16_compose
-DB 102,73,15,126,221
+ movq r13,xmm3
pextrq r14,xmm3,1
pxor xmm3,xmm1
@@ -1923,9 +1923,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -1944,9 +1944,9 @@
pslld xmm3,7
psrld xmm5,25
pxor xmm5,xmm3
-DB 102,15,58,15,237,4
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,12
+ palignr xmm5,xmm5,4
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,12
paddd xmm2,xmm6
pxor xmm14,xmm2
pshufb xmm14,XMMWORD[$L$rol16]
@@ -1965,9 +1965,9 @@
pslld xmm3,7
psrld xmm6,25
pxor xmm6,xmm3
-DB 102,15,58,15,246,4
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,12
+ palignr xmm6,xmm6,4
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,12
paddd xmm0,xmm4
pxor xmm12,xmm0
pshufb xmm12,XMMWORD[$L$rol16]
@@ -1986,9 +1986,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -2007,9 +2007,9 @@
pslld xmm3,7
psrld xmm5,25
pxor xmm5,xmm3
-DB 102,15,58,15,237,12
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,4
+ palignr xmm5,xmm5,12
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,4
paddd xmm2,xmm6
pxor xmm14,xmm2
pshufb xmm14,XMMWORD[$L$rol16]
@@ -2028,9 +2028,9 @@
pslld xmm3,7
psrld xmm6,25
pxor xmm6,xmm3
-DB 102,15,58,15,246,12
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,4
+ palignr xmm6,xmm6,12
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,4
dec r10
jnz NEAR $L$open_sse_128_rounds
@@ -2220,10 +2220,10 @@
pxor xmm14,xmm2
pxor xmm13,xmm1
pxor xmm12,xmm0
-DB 102,69,15,56,0,248
-DB 102,69,15,56,0,240
-DB 102,69,15,56,0,232
-DB 102,69,15,56,0,224
+ pshufb xmm15,xmm8
+ pshufb xmm14,xmm8
+ pshufb xmm13,xmm8
+ pshufb xmm12,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
paddd xmm11,xmm15
paddd xmm10,xmm14
@@ -2259,10 +2259,10 @@
pxor xmm14,xmm2
pxor xmm13,xmm1
pxor xmm12,xmm0
-DB 102,69,15,56,0,248
-DB 102,69,15,56,0,240
-DB 102,69,15,56,0,232
-DB 102,69,15,56,0,224
+ pshufb xmm15,xmm8
+ pshufb xmm14,xmm8
+ pshufb xmm13,xmm8
+ pshufb xmm12,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
paddd xmm11,xmm15
paddd xmm10,xmm14
@@ -2290,18 +2290,18 @@
pslld xmm4,32-25
pxor xmm4,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
-DB 102,15,58,15,255,4
-DB 102,69,15,58,15,219,8
-DB 102,69,15,58,15,255,12
-DB 102,15,58,15,246,4
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,12
-DB 102,15,58,15,237,4
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,12
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm7,xmm7,4
+ palignr xmm11,xmm11,8
+ palignr xmm15,xmm15,12
+ palignr xmm6,xmm6,4
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,12
+ palignr xmm5,xmm5,4
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
movdqa XMMWORD[(160+80)+rbp],xmm8
movdqa xmm8,XMMWORD[$L$rol16]
paddd xmm3,xmm7
@@ -2312,10 +2312,10 @@
pxor xmm14,xmm2
pxor xmm13,xmm1
pxor xmm12,xmm0
-DB 102,69,15,56,0,248
-DB 102,69,15,56,0,240
-DB 102,69,15,56,0,232
-DB 102,69,15,56,0,224
+ pshufb xmm15,xmm8
+ pshufb xmm14,xmm8
+ pshufb xmm13,xmm8
+ pshufb xmm12,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
paddd xmm11,xmm15
paddd xmm10,xmm14
@@ -2351,10 +2351,10 @@
pxor xmm14,xmm2
pxor xmm13,xmm1
pxor xmm12,xmm0
-DB 102,69,15,56,0,248
-DB 102,69,15,56,0,240
-DB 102,69,15,56,0,232
-DB 102,69,15,56,0,224
+ pshufb xmm15,xmm8
+ pshufb xmm14,xmm8
+ pshufb xmm13,xmm8
+ pshufb xmm12,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
paddd xmm11,xmm15
paddd xmm10,xmm14
@@ -2382,18 +2382,18 @@
pslld xmm4,32-25
pxor xmm4,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
-DB 102,15,58,15,255,12
-DB 102,69,15,58,15,219,8
-DB 102,69,15,58,15,255,4
-DB 102,15,58,15,246,12
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,4
-DB 102,15,58,15,237,12
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,4
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm7,xmm7,12
+ palignr xmm11,xmm11,8
+ palignr xmm15,xmm15,4
+ palignr xmm6,xmm6,12
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,4
+ palignr xmm5,xmm5,12
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
dec r10
jnz NEAR $L$seal_sse_init_rounds
@@ -2516,10 +2516,10 @@
pxor xmm14,xmm2
pxor xmm13,xmm1
pxor xmm12,xmm0
-DB 102,69,15,56,0,248
-DB 102,69,15,56,0,240
-DB 102,69,15,56,0,232
-DB 102,69,15,56,0,224
+ pshufb xmm15,xmm8
+ pshufb xmm14,xmm8
+ pshufb xmm13,xmm8
+ pshufb xmm12,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
paddd xmm11,xmm15
paddd xmm10,xmm14
@@ -2568,10 +2568,10 @@
pxor xmm14,xmm2
pxor xmm13,xmm1
pxor xmm12,xmm0
-DB 102,69,15,56,0,248
-DB 102,69,15,56,0,240
-DB 102,69,15,56,0,232
-DB 102,69,15,56,0,224
+ pshufb xmm15,xmm8
+ pshufb xmm14,xmm8
+ pshufb xmm13,xmm8
+ pshufb xmm12,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
paddd xmm11,xmm15
paddd xmm10,xmm14
@@ -2612,18 +2612,18 @@
imul r9,r12
add r15,r10
adc r9,rdx
-DB 102,15,58,15,255,4
-DB 102,69,15,58,15,219,8
-DB 102,69,15,58,15,255,12
-DB 102,15,58,15,246,4
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,12
-DB 102,15,58,15,237,4
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,12
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm7,xmm7,4
+ palignr xmm11,xmm11,8
+ palignr xmm15,xmm15,12
+ palignr xmm6,xmm6,4
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,12
+ palignr xmm5,xmm5,4
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
movdqa XMMWORD[(160+80)+rbp],xmm8
movdqa xmm8,XMMWORD[$L$rol16]
paddd xmm3,xmm7
@@ -2648,10 +2648,10 @@
adc r12,0
pxor xmm13,xmm1
pxor xmm12,xmm0
-DB 102,69,15,56,0,248
-DB 102,69,15,56,0,240
-DB 102,69,15,56,0,232
-DB 102,69,15,56,0,224
+ pshufb xmm15,xmm8
+ pshufb xmm14,xmm8
+ pshufb xmm13,xmm8
+ pshufb xmm12,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
paddd xmm11,xmm15
paddd xmm10,xmm14
@@ -2687,10 +2687,10 @@
pxor xmm14,xmm2
pxor xmm13,xmm1
pxor xmm12,xmm0
-DB 102,69,15,56,0,248
-DB 102,69,15,56,0,240
-DB 102,69,15,56,0,232
-DB 102,69,15,56,0,224
+ pshufb xmm15,xmm8
+ pshufb xmm14,xmm8
+ pshufb xmm13,xmm8
+ pshufb xmm12,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
paddd xmm11,xmm15
paddd xmm10,xmm14
@@ -2718,18 +2718,18 @@
pslld xmm4,32-25
pxor xmm4,xmm8
movdqa xmm8,XMMWORD[((160+80))+rbp]
-DB 102,15,58,15,255,12
-DB 102,69,15,58,15,219,8
-DB 102,69,15,58,15,255,4
-DB 102,15,58,15,246,12
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,4
-DB 102,15,58,15,237,12
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,4
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm7,xmm7,12
+ palignr xmm11,xmm11,8
+ palignr xmm15,xmm15,4
+ palignr xmm6,xmm6,12
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,4
+ palignr xmm5,xmm5,12
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
lea rdi,[16+rdi]
dec r8
@@ -2942,9 +2942,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
paddd xmm0,xmm4
pxor xmm12,xmm0
pshufb xmm12,XMMWORD[$L$rol16]
@@ -2963,9 +2963,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
add r10,QWORD[((0+0))+rdi]
adc r11,QWORD[((8+0))+rdi]
adc r12,1
@@ -3095,9 +3095,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -3116,9 +3116,9 @@
pslld xmm3,7
psrld xmm5,25
pxor xmm5,xmm3
-DB 102,15,58,15,237,4
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,12
+ palignr xmm5,xmm5,4
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,12
add r10,QWORD[((0+0))+rdi]
adc r11,QWORD[((8+0))+rdi]
adc r12,1
@@ -3177,9 +3177,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -3198,9 +3198,9 @@
pslld xmm3,7
psrld xmm5,25
pxor xmm5,xmm3
-DB 102,15,58,15,237,12
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,4
+ palignr xmm5,xmm5,12
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,4
lea rdi,[16+rdi]
dec rcx
@@ -3315,9 +3315,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -3336,9 +3336,9 @@
pslld xmm3,7
psrld xmm5,25
pxor xmm5,xmm3
-DB 102,15,58,15,237,4
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,12
+ palignr xmm5,xmm5,4
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,12
paddd xmm2,xmm6
pxor xmm14,xmm2
pshufb xmm14,XMMWORD[$L$rol16]
@@ -3357,9 +3357,9 @@
pslld xmm3,7
psrld xmm6,25
pxor xmm6,xmm3
-DB 102,15,58,15,246,4
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,12
+ palignr xmm6,xmm6,4
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,12
add r10,QWORD[((0+0))+rdi]
adc r11,QWORD[((8+0))+rdi]
adc r12,1
@@ -3418,9 +3418,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -3439,9 +3439,9 @@
pslld xmm3,7
psrld xmm5,25
pxor xmm5,xmm3
-DB 102,15,58,15,237,12
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,4
+ palignr xmm5,xmm5,12
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,4
paddd xmm2,xmm6
pxor xmm14,xmm2
pshufb xmm14,XMMWORD[$L$rol16]
@@ -3460,9 +3460,9 @@
pslld xmm3,7
psrld xmm6,25
pxor xmm6,xmm3
-DB 102,15,58,15,246,12
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,4
+ palignr xmm6,xmm6,12
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,4
lea rdi,[16+rdi]
dec rcx
@@ -3714,7 +3714,7 @@
-DB 102,77,15,126,253
+ movq r13,xmm15
pextrq r14,xmm15,1
add r10,r13
adc r11,r14
@@ -3830,7 +3830,7 @@
lea r15,[$L$and_masks]
shl rbx,4
pand xmm15,XMMWORD[((-16))+rbx*1+r15]
-DB 102,77,15,126,253
+ movq r13,xmm15
pextrq r14,xmm15,1
add r10,r13
adc r11,r14
@@ -4005,9 +4005,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,4
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,12
+ palignr xmm4,xmm4,4
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,12
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -4026,9 +4026,9 @@
pslld xmm3,7
psrld xmm5,25
pxor xmm5,xmm3
-DB 102,15,58,15,237,4
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,12
+ palignr xmm5,xmm5,4
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,12
paddd xmm2,xmm6
pxor xmm14,xmm2
pshufb xmm14,XMMWORD[$L$rol16]
@@ -4047,9 +4047,9 @@
pslld xmm3,7
psrld xmm6,25
pxor xmm6,xmm3
-DB 102,15,58,15,246,4
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,12
+ palignr xmm6,xmm6,4
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,12
paddd xmm0,xmm4
pxor xmm12,xmm0
pshufb xmm12,XMMWORD[$L$rol16]
@@ -4068,9 +4068,9 @@
pslld xmm3,7
psrld xmm4,25
pxor xmm4,xmm3
-DB 102,15,58,15,228,12
-DB 102,69,15,58,15,192,8
-DB 102,69,15,58,15,228,4
+ palignr xmm4,xmm4,12
+ palignr xmm8,xmm8,8
+ palignr xmm12,xmm12,4
paddd xmm1,xmm5
pxor xmm13,xmm1
pshufb xmm13,XMMWORD[$L$rol16]
@@ -4089,9 +4089,9 @@
pslld xmm3,7
psrld xmm5,25
pxor xmm5,xmm3
-DB 102,15,58,15,237,12
-DB 102,69,15,58,15,201,8
-DB 102,69,15,58,15,237,4
+ palignr xmm5,xmm5,12
+ palignr xmm9,xmm9,8
+ palignr xmm13,xmm13,4
paddd xmm2,xmm6
pxor xmm14,xmm2
pshufb xmm14,XMMWORD[$L$rol16]
@@ -4110,9 +4110,9 @@
pslld xmm3,7
psrld xmm6,25
pxor xmm6,xmm3
-DB 102,15,58,15,246,12
-DB 102,69,15,58,15,210,8
-DB 102,69,15,58,15,246,4
+ palignr xmm6,xmm6,12
+ palignr xmm10,xmm10,8
+ palignr xmm14,xmm14,4
dec r10
jnz NEAR $L$seal_sse_128_rounds