#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 block transform for x86. September 2007.
#
# Performance improvement over compiler-generated code varies from
# 10% to 40% [see below]. Not very impressive on some µ-archs, but
# it's 5 times smaller and optimizes the number of writes.
#
# May 2012.
#
# Optimization including two of Pavel Semjanov's ideas, alternative
# Maj and full unroll, resulted in ~20-25% improvement on most CPUs,
# ~7% on Pentium, ~40% on Atom. As the fully unrolled loop body is
# almost 15x larger, 8KB vs. 560B, it's engaged only for longer
# inputs. But not on P4, where it kills performance, nor on Sandy
# Bridge, where the folded loop is approximately as fast...
#
# June 2012.
#
# Add AMD XOP-specific code path, >30% improvement on Bulldozer over
# May version, >60% over original. Add AVX+shrd code path, >25%
# improvement on Sandy Bridge over May version, 60% over original.
#
# May 2013.
#
# Replace AMD XOP code path with SSSE3 to cover more processors.
# (Biggest improvement coefficient is on upcoming Atom Silvermont,
# not shown.) Add AVX+BMI code path.
#
# March 2014.
#
# Add support for Intel SHA Extensions.
#
# Performance in clock cycles per processed byte (less is better):
#
#		gcc	icc	x86 asm(*)	SIMD	x86_64 asm(**)
# Pentium	46	57	40/38		-	-
# PIII		36	33	27/24		-	-
# P4		41	38	28		-	17.3
# AMD K8	27	25	19/15.5		-	14.9
# Core2		26	23	18/15.6		14.3	13.8
# Westmere	27	-	19/15.7		13.4	12.3
# Sandy Bridge	25	-	15.9		12.4	11.6
# Ivy Bridge	24	-	15.0		11.4	10.3
# Haswell	22	-	13.9		9.46	7.80
# Skylake	20	-	14.9		9.50	7.70
# Bulldozer	36	-	27/22		17.0	13.6
# VIA Nano	36	-	25/22		16.8	16.5
# Atom		50	-	30/25		21.9	18.9
# Silvermont	40	-	34/31		22.9	20.6
# Goldmont	29	-	20		16.3(***)
#
# (*)	numbers after slash are for unrolled loop, where applicable;
# (**)	x86_64 assembly performance is presented for reference
#	purposes, results are best-available;
# (***)	SHAEXT result is 4.1, strangely enough better than the 64-bit
#	one;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

$output=pop;
open STDOUT,">$output";

&asm_init($ARGV[0],"sha256-586.pl",$ARGV[$#ARGV] eq "386");

$xmm=$avx=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2.
$avx = 1;

$avx = 0 unless ($xmm);

$shaext=$xmm;	### set to zero if compiling for 1.0.1

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext = 0;

$unroll_after = 64*4;	# If pre-evicted from the L1 code cache, the
			# first spin of the fully unrolled loop was
			# measured to run about 3-4x slower. If the
			# slowdown coefficient is N and the unrolled
			# loop is m times faster, you break even at
			# (N-1)/(m-1) blocks. That then has to be
			# adjusted for the probability of the code
			# being evicted, code size/cache size=1/4.
			# Typical m is 1.15...
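# An illustrative sanity check of that threshold (arithmetic assumed
# from the figures above, not part of the original comment): taking
# the slowdown N ~ 3.5 and the typical speedup m = 1.15 gives a
# break-even point of (N-1)/(m-1) = 2.5/0.15 ~ 16.7 blocks; scaling
# by the 1/4 eviction probability gives ~4.2 blocks, i.e. about four
# 64-byte blocks, which is where $unroll_after = 64*4 = 256 bytes
# comes from.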

$A="eax";
$E="edx";
$T="ebx";
$Aoff=&DWP(4,"esp");
$Boff=&DWP(8,"esp");
$Coff=&DWP(12,"esp");
$Doff=&DWP(16,"esp");
$Eoff=&DWP(20,"esp");
$Foff=&DWP(24,"esp");
$Goff=&DWP(28,"esp");
$Hoff=&DWP(32,"esp");
$Xoff=&DWP(36,"esp");
$K256="ebp";

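# For reference, the standard SHA-256 message schedule computed by
# BODY_16_63 below (textbook definitions, restated here for
# readability):
#
#	sigma0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3)
#	sigma1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10)
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
#
# The split rotate counts exploit the fact that rotation distributes
# over XOR: "ror 18-7", an XOR with the original value, and a final
# "ror 7" leave (x >>> 18) ^ (x >>> 7) in a single register.
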
sub BODY_16_63() {
	&mov	($T,"ecx");			# "ecx" is preloaded
	&mov	("esi",&DWP(4*(9+15+16-14),"esp"));
	&ror	("ecx",18-7);
	&mov	("edi","esi");
	&ror	("esi",19-17);
	&xor	("ecx",$T);
	&shr	($T,3);
	&ror	("ecx",7);
	&xor	("esi","edi");
	&xor	($T,"ecx");			# T = sigma0(X[-15])
	&ror	("esi",17);
	&add	($T,&DWP(4*(9+15+16),"esp"));	# T += X[-16]
	&shr	("edi",10);
	&add	($T,&DWP(4*(9+15+16-9),"esp"));	# T += X[-7]
	#&xor	("edi","esi")			# sigma1(X[-2])
	# &add	($T,"edi");			# T += sigma1(X[-2])
	# &mov	(&DWP(4*(9+15),"esp"),$T);	# save X[0]

	&BODY_00_15(1);
}
sub BODY_00_15() {
    my $in_16_63=shift;

	&mov	("ecx",$E);
	&xor	("edi","esi")			if ($in_16_63);	# sigma1(X[-2])
	&mov	("esi",$Foff);
	&ror	("ecx",25-11);
	&add	($T,"edi")			if ($in_16_63);	# T += sigma1(X[-2])
	&mov	("edi",$Goff);
	&xor	("ecx",$E);
	&xor	("esi","edi");
	&mov	($T,&DWP(4*(9+15),"esp"))	if (!$in_16_63);
	&mov	(&DWP(4*(9+15),"esp"),$T)	if ($in_16_63);	# save X[0]
	&ror	("ecx",11-6);
	&and	("esi",$E);
	&mov	($Eoff,$E);		# modulo-scheduled
	&xor	($E,"ecx");
	&add	($T,$Hoff);		# T += h
	&xor	("esi","edi");		# Ch(e,f,g)
	&ror	($E,6);			# Sigma1(e)
	&mov	("ecx",$A);
	&add	($T,"esi");		# T += Ch(e,f,g)

	&ror	("ecx",22-13);
	&add	($T,$E);		# T += Sigma1(e)
	&mov	("edi",$Boff);
	&xor	("ecx",$A);
	&mov	($Aoff,$A);		# modulo-scheduled
	&lea	("esp",&DWP(-4,"esp"));
	&ror	("ecx",13-2);
	&mov	("esi",&DWP(0,$K256));
	&xor	("ecx",$A);
	&mov	($E,$Eoff);		# e in next iteration, d in this one
	&xor	($A,"edi");		# a ^= b
	&ror	("ecx",2);		# Sigma0(a)

	&add	($T,"esi");		# T += K[i]
	&mov	(&DWP(0,"esp"),$A);	# (b^c) in next round
	&add	($E,$T);		# d += T
	&and	($A,&DWP(4,"esp"));	# a &= (b^c)
	&add	($T,"ecx");		# T += Sigma0(a)
	&xor	($A,"edi");		# h = Maj(a,b,c) = Ch(a^b,c,b)
	&mov	("ecx",&DWP(4*(9+15+16-1),"esp"))	if ($in_16_63);	# preload T
	&add	($K256,4);
	&add	($A,$T);		# h += T
}
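
# The "alternative Maj" (see the May 2012 note above) rests on the
# identity Maj(a,b,c) = Ch(a^b,c,b): expanding Ch(x,y,z) = (x&y)^(~x&z)
# with x=a^b, y=c, z=b gives ((a^b)&c) ^ (~(a^b)&b), and both forms
# reduce to (a&b)^(a&c)^(b&c). The and/xor tail above computes it as
# ((a^b)&(b^c))^b, which also lets each round hand its (a^b) to the
# next round as that round's (b^c), saving work.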

&external_label("OPENSSL_ia32cap_P")		if (!$i386);

&function_begin("sha256_block_data_order");
	&mov	("esi",wparam(0));	# ctx
	&mov	("edi",wparam(1));	# inp
	&mov	("eax",wparam(2));	# num
	&mov	("ebx","esp");		# saved sp

	&call	(&label("pic_point"));	# make it PIC!
&set_label("pic_point");
	&blindpop($K256);
	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));

	&sub	("esp",16);
	&and	("esp",-64);

	&shl	("eax",6);
	&add	("eax","edi");
	&mov	(&DWP(0,"esp"),"esi");	# ctx
	&mov	(&DWP(4,"esp"),"edi");	# inp
	&mov	(&DWP(8,"esp"),"eax");	# inp+num*64
	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
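# The dispatch below keys off OPENSSL_ia32cap_P. Layout as relied upon
# here (notes on the capability-vector convention, added for
# readability): word 0 mirrors CPUID(1).EDX with bit 20 repurposed as
# the P4/Netburst marker and bit 30 as the "genuine Intel" marker
# (bit 24 = FXSR); word 1 mirrors CPUID(1).ECX (bit 9 = SSSE3,
# bit 28 = AVX); word 2 mirrors CPUID(7).EBX (bit 3 = BMI1,
# bit 8 = BMI2, bit 29 = SHA).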
						if (!$i386 && $xmm) {
	&picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256"));
	&mov	("ecx",&DWP(0,"edx"));
	&mov	("ebx",&DWP(4,"edx"));
	&test	("ecx",1<<20);		# check for P4
	&jnz	(&label("loop"));
	&mov	("edx",&DWP(8,"edx"))	if ($xmm);
	&test	("ecx",1<<24);		# check for FXSR
	&jz	($unroll_after?&label("no_xmm"):&label("loop"));
	&and	("ecx",1<<30);		# mask "Intel CPU" bit
	&and	("ebx",1<<28|1<<9);	# mask AVX and SSSE3 bits
	&test	("edx",1<<29)		if ($shaext);	# check for SHA
	&jnz	(&label("shaext"))	if ($shaext);
	&or	("ecx","ebx");
	&and	("ecx",1<<28|1<<30);
	&cmp	("ecx",1<<28|1<<30);
					if ($xmm) {
	&je	(&label("AVX"))		if ($avx);
	&test	("ebx",1<<9);		# check for SSSE3
	&jnz	(&label("SSSE3"));
					} else {
	&je	(&label("loop_shrd"));
					}
						if ($unroll_after) {
&set_label("no_xmm");
	&sub	("eax","edi");
	&cmp	("eax",$unroll_after);
	&jae	(&label("unrolled"));
						} }
	&jmp	(&label("loop"));

sub COMPACT_LOOP() {
my $suffix=shift;

&set_label("loop$suffix",$suffix?32:16);
	# copy input block to stack reversing byte and dword order
	for($i=0;$i<4;$i++) {
		&mov	("eax",&DWP($i*16+0,"edi"));
		&mov	("ebx",&DWP($i*16+4,"edi"));
		&mov	("ecx",&DWP($i*16+8,"edi"));
		&bswap	("eax");
		&mov	("edx",&DWP($i*16+12,"edi"));
		&bswap	("ebx");
		&push	("eax");
		&bswap	("ecx");
		&push	("ebx");
		&bswap	("edx");
		&push	("ecx");
		&push	("edx");
	}
	&add	("edi",64);
	&lea	("esp",&DWP(-4*9,"esp"));	# place for A,B,C,D,E,F,G,H
	&mov	(&DWP(4*(9+16)+4,"esp"),"edi");

	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
	&mov	($A,&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edi",&DWP(12,"esi"));
	# &mov	($Aoff,$A);
	&mov	($Boff,"ebx");
	&xor	("ebx","ecx");
	&mov	($Coff,"ecx");
	&mov	($Doff,"edi");
	&mov	(&DWP(0,"esp"),"ebx");	# magic
	&mov	($E,&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("edi",&DWP(28,"esi"));
	# &mov	($Eoff,$E);
	&mov	($Foff,"ebx");
	&mov	($Goff,"ecx");
	&mov	($Hoff,"edi");

&set_label("00_15$suffix",16);

	&BODY_00_15();

	&cmp	("esi",0xc19bf174);
	&jne	(&label("00_15$suffix"));

	&mov	("ecx",&DWP(4*(9+15+16-1),"esp"));	# preloaded in BODY_00_15(1)
	&jmp	(&label("16_63$suffix"));

&set_label("16_63$suffix",16);

	&BODY_16_63();

	&cmp	("esi",0xc67178f2);
	&jne	(&label("16_63$suffix"));

	&mov	("esi",&DWP(4*(9+16+64)+0,"esp"));	# ctx
	# &mov	($A,$Aoff);
	&mov	("ebx",$Boff);
	# &mov	("edi",$Coff);
	&mov	("ecx",$Doff);
	&add	($A,&DWP(0,"esi"));
	&add	("ebx",&DWP(4,"esi"));
	&add	("edi",&DWP(8,"esi"));
	&add	("ecx",&DWP(12,"esi"));
	&mov	(&DWP(0,"esi"),$A);
	&mov	(&DWP(4,"esi"),"ebx");
	&mov	(&DWP(8,"esi"),"edi");
	&mov	(&DWP(12,"esi"),"ecx");
	# &mov	($E,$Eoff);
	&mov	("eax",$Foff);
	&mov	("ebx",$Goff);
	&mov	("ecx",$Hoff);
	&mov	("edi",&DWP(4*(9+16+64)+4,"esp"));	# inp
	&add	($E,&DWP(16,"esi"));
	&add	("eax",&DWP(20,"esi"));
	&add	("ebx",&DWP(24,"esi"));
	&add	("ecx",&DWP(28,"esi"));
	&mov	(&DWP(16,"esi"),$E);
	&mov	(&DWP(20,"esi"),"eax");
	&mov	(&DWP(24,"esi"),"ebx");
	&mov	(&DWP(28,"esi"),"ecx");

	&lea	("esp",&DWP(4*(9+16+64),"esp"));	# destroy frame
	&sub	($K256,4*64);				# rewind K

	&cmp	("edi",&DWP(8,"esp"));			# are we done yet?
	&jb	(&label("loop$suffix"));
}
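
# Stack frame of the compact loop at the top of each iteration,
# relative to %esp (map added for readability):
#
#	  +0		(b^c) carried from the previous round ("magic")
#	 +4..+32	a,b,c,d,e,f,g,h		($Aoff..$Hoff)
#	+36..+96	X[0..15], byte-swapped input ($Xoff)
#	+100/+104	ctx / inp
#	+108/+112	end-of-input / saved %esp
#
# Each round's "lea esp,-4" then slides the window down by 4 bytes,
# which is why BODY_* address X through 4*(9+15+...)-style constants.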
	&COMPACT_LOOP();
	&mov	("esp",&DWP(12,"esp"));	# restore sp
&function_end_A();
						if (!$i386 && !$xmm) {
	# ~20% improvement on Sandy Bridge
	local *ror = sub { &shrd(@_[0],@_) };
	&COMPACT_LOOP("_shrd");
	&mov	("esp",&DWP(12,"esp"));	# restore sp
&function_end_A();
						}

&set_label("K256",64);	# Yes! I keep it in the code segment!
@K256=(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2	);
&data_word(@K256);
&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# byte swap mask
&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");

($a,$b,$c,$d,$e,$f,$g,$h)=(0..7);	# offsets
sub off { &DWP(4*(((shift)-$i)&7),"esp"); }
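
# off() implements register renaming on the stack: the eight state
# words stay in fixed slots at 0..28(%esp) and only their names rotate
# with $i. For example, off($a) is 0(%esp) at $i==0 but 28(%esp) at
# $i==1 (the slot that held h now answers to a), so the per-round
# shuffle h=g, g=f, ..., b=a costs no moves at all.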

						if (!$i386 && $unroll_after) {
my @AH=($A,$K256);

&set_label("unrolled",16);
	&lea	("esp",&DWP(-96,"esp"));
	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
	&mov	($AH[0],&DWP(0,"esi"));
	&mov	($AH[1],&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("ebx",&DWP(12,"esi"));
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	&xor	($AH[1],"ecx");		# magic
	&mov	(&DWP(8,"esp"),"ecx");
	&mov	(&DWP(12,"esp"),"ebx");
	&mov	($E,&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("esi",&DWP(28,"esi"));
	#&mov	(&DWP(16,"esp"),$E);
	&mov	(&DWP(20,"esp"),"ebx");
	&mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esp"),"esi");
	&jmp	(&label("grand_loop"));

&set_label("grand_loop",16);
	# copy input block to stack reversing byte order
	for($i=0;$i<5;$i++) {
		&mov	("ebx",&DWP(12*$i+0,"edi"));
		&mov	("ecx",&DWP(12*$i+4,"edi"));
		&bswap	("ebx");
		&mov	("esi",&DWP(12*$i+8,"edi"));
		&bswap	("ecx");
		&mov	(&DWP(32+12*$i+0,"esp"),"ebx");
		&bswap	("esi");
		&mov	(&DWP(32+12*$i+4,"esp"),"ecx");
		&mov	(&DWP(32+12*$i+8,"esp"),"esi");
	}
	&mov	("ebx",&DWP($i*12,"edi"));
	&add	("edi",64);
	&bswap	("ebx");
	&mov	(&DWP(96+4,"esp"),"edi");
	&mov	(&DWP(32+12*$i,"esp"),"ebx");

my ($t1,$t2) = ("ecx","esi");

	for ($i=0;$i<64;$i++) {

	    if ($i>=16) {
		&mov	($T,$t1);			# $t1 is preloaded
		# &mov	($t2,&DWP(32+4*(($i+14)&15),"esp"));
		&ror	($t1,18-7);
		&mov	("edi",$t2);
		&ror	($t2,19-17);
		&xor	($t1,$T);
		&shr	($T,3);
		&ror	($t1,7);
		&xor	($t2,"edi");
		&xor	($T,$t1);			# T = sigma0(X[-15])
		&ror	($t2,17);
		&add	($T,&DWP(32+4*($i&15),"esp"));	# T += X[-16]
		&shr	("edi",10);
		&add	($T,&DWP(32+4*(($i+9)&15),"esp"));	# T += X[-7]
		#&xor	("edi",$t2)			# sigma1(X[-2])
		# &add	($T,"edi");			# T += sigma1(X[-2])
		# &mov	(&DWP(4*(9+15),"esp"),$T);	# save X[0]
	    }
	&mov	($t1,$E);
	&xor	("edi",$t2)			if ($i>=16);	# sigma1(X[-2])
	&mov	($t2,&off($f));
	&ror	($E,25-11);
	&add	($T,"edi")			if ($i>=16);	# T += sigma1(X[-2])
	&mov	("edi",&off($g));
	&xor	($E,$t1);
	&mov	($T,&DWP(32+4*($i&15),"esp"))	if ($i<16);	# X[i]
	&mov	(&DWP(32+4*($i&15),"esp"),$T)	if ($i>=16 && $i<62);	# save X[0]
	&xor	($t2,"edi");
	&ror	($E,11-6);
	&and	($t2,$t1);
	&mov	(&off($e),$t1);		# save $E, modulo-scheduled
	&xor	($E,$t1);
	&add	($T,&off($h));		# T += h
	&xor	("edi",$t2);		# Ch(e,f,g)
	&ror	($E,6);			# Sigma1(e)
	&mov	($t1,$AH[0]);
	&add	($T,"edi");		# T += Ch(e,f,g)

	&ror	($t1,22-13);
	&mov	($t2,$AH[0]);
	&mov	("edi",&off($b));
	&xor	($t1,$AH[0]);
	&mov	(&off($a),$AH[0]);	# save $A, modulo-scheduled
	&xor	($AH[0],"edi");		# a ^= b, (b^c) in next round
	&ror	($t1,13-2);
	&and	($AH[1],$AH[0]);	# (b^c) &= (a^b)
	&lea	($E,&DWP(@K256[$i],$T,$E));	# T += Sigma1(e)+K[i]
	&xor	($t1,$t2);
	&xor	($AH[1],"edi");		# h = Maj(a,b,c) = Ch(a^b,c,b)
	&mov	($t2,&DWP(32+4*(($i+2)&15),"esp"))	if ($i>=15 && $i<63);
	&ror	($t1,2);		# Sigma0(a)

	&add	($AH[1],$E);		# h += T
	&add	($E,&off($d));		# d += T
	&add	($AH[1],$t1);		# h += Sigma0(a)
	&mov	($t1,&DWP(32+4*(($i+15)&15),"esp"))	if ($i>=15 && $i<63);

	@AH = reverse(@AH);		# rotate(a,h)
	($t1,$t2) = ($t2,$t1);		# rotate(t1,t2)
	}
	&mov	("esi",&DWP(96,"esp"));	# ctx
	#&mov	($AH[0],&DWP(0,"esp"));
	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
	#&mov	("edi", &DWP(8,"esp"));
	&mov	("ecx",&DWP(12,"esp"));
	&add	($AH[0],&DWP(0,"esi"));
	&add	($AH[1],&DWP(4,"esi"));
	&add	("edi",&DWP(8,"esi"));
	&add	("ecx",&DWP(12,"esi"));
	&mov	(&DWP(0,"esi"),$AH[0]);
	&mov	(&DWP(4,"esi"),$AH[1]);
	&mov	(&DWP(8,"esi"),"edi");
	&mov	(&DWP(12,"esi"),"ecx");
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	&xor	($AH[1],"edi");		# magic
	&mov	(&DWP(8,"esp"),"edi");
	&mov	(&DWP(12,"esp"),"ecx");
	#&mov	($E,&DWP(16,"esp"));
	&mov	("edi",&DWP(20,"esp"));
	&mov	("ebx",&DWP(24,"esp"));
	&mov	("ecx",&DWP(28,"esp"));
	&add	($E,&DWP(16,"esi"));
	&add	("edi",&DWP(20,"esi"));
	&add	("ebx",&DWP(24,"esi"));
	&add	("ecx",&DWP(28,"esi"));
	&mov	(&DWP(16,"esi"),$E);
	&mov	(&DWP(20,"esi"),"edi");
	&mov	(&DWP(24,"esi"),"ebx");
	&mov	(&DWP(28,"esi"),"ecx");
	#&mov	(&DWP(16,"esp"),$E);
	&mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp
	&mov	(&DWP(24,"esp"),"ebx");
	&mov	(&DWP(28,"esp"),"ecx");

	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
	&jb	(&label("grand_loop"));

	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
&function_end_A();
						}
						if (!$i386 && $xmm) {{{
if ($shaext) {
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
my ($ctx,$inp,$end)=("esi","edi","eax");
my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7));
my @MSG=map("xmm$_",(3..6));

sub sha256op38 {
 my ($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
}
sub sha256rnds2	{ sha256op38(0xcb,@_); }
sub sha256msg1	{ sha256op38(0xcc,@_); }
sub sha256msg2	{ sha256op38(0xcd,@_); }
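
# sha256op38() hand-assembles the SHA extension instructions for
# assemblers that predate them. Worked example (illustrative):
# &sha256rnds2("xmm1","xmm0") emits 0x0f,0x38,0xcb,0xc8, where ModR/M
# 0xc8 = 0b11_001_000 encodes mod=11 (register direct), reg=001 (first
# operand, xmm1) and rm=000 (second operand, xmm0), i.e. exactly
# 0xc0|(1<<3)|0 as computed above.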

&set_label("shaext",32);
	&sub		("esp",32);

	&movdqu		($ABEF,&QWP(0,$ctx));		# DCBA
	&lea		($K256,&DWP(0x80,$K256));
	&movdqu		($CDGH,&QWP(16,$ctx));		# HGFE
	&movdqa		($TMP,&QWP(0x100-0x80,$K256));	# byte swap mask

	&pshufd		($Wi,$ABEF,0x1b);		# ABCD
	&pshufd		($ABEF,$ABEF,0xb1);		# CDAB
	&pshufd		($CDGH,$CDGH,0x1b);		# EFGH
	&palignr	($ABEF,$CDGH,8);		# ABEF
	&punpcklqdq	($CDGH,$Wi);			# CDGH
	&jmp		(&label("loop_shaext"));

&set_label("loop_shaext",16);
	&movdqu		(@MSG[0],&QWP(0,$inp));
	&movdqu		(@MSG[1],&QWP(0x10,$inp));
	&movdqu		(@MSG[2],&QWP(0x20,$inp));
	&pshufb		(@MSG[0],$TMP);
	&movdqu		(@MSG[3],&QWP(0x30,$inp));
	&movdqa		(&QWP(16,"esp"),$CDGH);		# offload

	&movdqa		($Wi,&QWP(0*16-0x80,$K256));
	&paddd		($Wi,@MSG[0]);
	&pshufb		(@MSG[1],$TMP);
	&sha256rnds2	($CDGH,$ABEF);			# 0-3
	&pshufd		($Wi,$Wi,0x0e);
	&nop		();
	&movdqa		(&QWP(0,"esp"),$ABEF);		# offload
	&sha256rnds2	($ABEF,$CDGH);

	&movdqa		($Wi,&QWP(1*16-0x80,$K256));
	&paddd		($Wi,@MSG[1]);
	&pshufb		(@MSG[2],$TMP);
	&sha256rnds2	($CDGH,$ABEF);			# 4-7
	&pshufd		($Wi,$Wi,0x0e);
	&lea		($inp,&DWP(0x40,$inp));
	&sha256msg1	(@MSG[0],@MSG[1]);
	&sha256rnds2	($ABEF,$CDGH);

	&movdqa		($Wi,&QWP(2*16-0x80,$K256));
	&paddd		($Wi,@MSG[2]);
	&pshufb		(@MSG[3],$TMP);
	&sha256rnds2	($CDGH,$ABEF);			# 8-11
	&pshufd		($Wi,$Wi,0x0e);
	&movdqa		($TMP,@MSG[3]);
	&palignr	($TMP,@MSG[2],4);
	&nop		();
	&paddd		(@MSG[0],$TMP);
	&sha256msg1	(@MSG[1],@MSG[2]);
	&sha256rnds2	($ABEF,$CDGH);

	&movdqa		($Wi,&QWP(3*16-0x80,$K256));
	&paddd		($Wi,@MSG[3]);
	&sha256msg2	(@MSG[0],@MSG[3]);
	&sha256rnds2	($CDGH,$ABEF);			# 12-15
	&pshufd		($Wi,$Wi,0x0e);
	&movdqa		($TMP,@MSG[0]);
	&palignr	($TMP,@MSG[3],4);
	&nop		();
	&paddd		(@MSG[1],$TMP);
	&sha256msg1	(@MSG[2],@MSG[3]);
	&sha256rnds2	($ABEF,$CDGH);

for($i=4;$i<16-3;$i++) {
	&movdqa		($Wi,&QWP($i*16-0x80,$K256));
	&paddd		($Wi,@MSG[0]);
	&sha256msg2	(@MSG[1],@MSG[0]);
	&sha256rnds2	($CDGH,$ABEF);			# 16-19...
	&pshufd		($Wi,$Wi,0x0e);
	&movdqa		($TMP,@MSG[1]);
	&palignr	($TMP,@MSG[0],4);
	&nop		();
	&paddd		(@MSG[2],$TMP);
	&sha256msg1	(@MSG[3],@MSG[0]);
	&sha256rnds2	($ABEF,$CDGH);

	push(@MSG,shift(@MSG));
}
	&movdqa		($Wi,&QWP(13*16-0x80,$K256));
	&paddd		($Wi,@MSG[0]);
	&sha256msg2	(@MSG[1],@MSG[0]);
	&sha256rnds2	($CDGH,$ABEF);			# 52-55
	&pshufd		($Wi,$Wi,0x0e);
	&movdqa		($TMP,@MSG[1]);
	&palignr	($TMP,@MSG[0],4);
	&sha256rnds2	($ABEF,$CDGH);
	&paddd		(@MSG[2],$TMP);

	&movdqa		($Wi,&QWP(14*16-0x80,$K256));
	&paddd		($Wi,@MSG[1]);
	&sha256rnds2	($CDGH,$ABEF);			# 56-59
	&pshufd		($Wi,$Wi,0x0e);
	&sha256msg2	(@MSG[2],@MSG[1]);
	&movdqa		($TMP,&QWP(0x100-0x80,$K256));	# byte swap mask
	&sha256rnds2	($ABEF,$CDGH);

	&movdqa		($Wi,&QWP(15*16-0x80,$K256));
	&paddd		($Wi,@MSG[2]);
	&nop		();
	&sha256rnds2	($CDGH,$ABEF);			# 60-63
	&pshufd		($Wi,$Wi,0x0e);
	&cmp		($end,$inp);
	&nop		();
	&sha256rnds2	($ABEF,$CDGH);

	&paddd		($CDGH,&QWP(16,"esp"));
	&paddd		($ABEF,&QWP(0,"esp"));
	&jnz		(&label("loop_shaext"));

	&pshufd		($CDGH,$CDGH,0xb1);		# DCHG
	&pshufd		($TMP,$ABEF,0x1b);		# FEBA
	&pshufd		($ABEF,$ABEF,0xb1);		# BAFE
	&punpckhqdq	($ABEF,$CDGH);			# DCBA
	&palignr	($CDGH,$TMP,8);			# HGFE

	&mov		("esp",&DWP(32+12,"esp"));
	&movdqu		(&QWP(0,$ctx),$ABEF);
	&movdqu		(&QWP(16,$ctx),$CDGH);
&function_end_A();
}

my @X = map("xmm$_",(0..3));
my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7));
my @AH = ($A,$T);

&set_label("SSSE3",32);
	&lea	("esp",&DWP(-96,"esp"));
	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
	&mov	($AH[0],&DWP(0,"esi"));
	&mov	($AH[1],&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edi",&DWP(12,"esi"));
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	&xor	($AH[1],"ecx");			# magic
	&mov	(&DWP(8,"esp"),"ecx");
	&mov	(&DWP(12,"esp"),"edi");
	&mov	($E,&DWP(16,"esi"));
	&mov	("edi",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("esi",&DWP(28,"esi"));
	#&mov	(&DWP(16,"esp"),$E);
	&mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp
	&mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esp"),"esi");
	&movdqa	($t3,&QWP(256,$K256));
	&jmp	(&label("grand_ssse3"));

&set_label("grand_ssse3",16);
	# load input, reverse byte order, add K256[0..15], save to stack
	&movdqu	(@X[0],&QWP(0,"edi"));
	&movdqu	(@X[1],&QWP(16,"edi"));
	&movdqu	(@X[2],&QWP(32,"edi"));
	&movdqu	(@X[3],&QWP(48,"edi"));
	&add	("edi",64);
	&pshufb	(@X[0],$t3);
	&mov	(&DWP(96+4,"esp"),"edi");
	&pshufb	(@X[1],$t3);
	&movdqa	($t0,&QWP(0,$K256));
	&pshufb	(@X[2],$t3);
	&movdqa	($t1,&QWP(16,$K256));
	&paddd	($t0,@X[0]);
	&pshufb	(@X[3],$t3);
	&movdqa	($t2,&QWP(32,$K256));
	&paddd	($t1,@X[1]);
	&movdqa	($t3,&QWP(48,$K256));
	&movdqa	(&QWP(32+0,"esp"),$t0);
	&paddd	($t2,@X[2]);
	&movdqa	(&QWP(32+16,"esp"),$t1);
	&paddd	($t3,@X[3]);
	&movdqa	(&QWP(32+32,"esp"),$t2);
	&movdqa	(&QWP(32+48,"esp"),$t3);
	&jmp	(&label("ssse3_00_47"));

&set_label("ssse3_00_47",16);
	&add	($K256,64);

sub SSSE3_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions

	eval(shift(@insns));
	&movdqa	($t0,@X[1]);
	eval(shift(@insns));			# @
	eval(shift(@insns));
	&movdqa	($t3,@X[3]);
	eval(shift(@insns));
	eval(shift(@insns));
	&palignr	($t0,@X[0],4);		# X[1..4]
	eval(shift(@insns));
	eval(shift(@insns));			# @
	eval(shift(@insns));
	&palignr	($t3,@X[2],4);		# X[9..12]
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&movdqa	($t1,$t0);
	eval(shift(@insns));			# @
	eval(shift(@insns));
	&movdqa	($t2,$t0);
	eval(shift(@insns));
	eval(shift(@insns));
	&psrld	($t0,3);
	eval(shift(@insns));
	eval(shift(@insns));			# @
	&paddd	(@X[0],$t3);			# X[0..3] += X[9..12]
	eval(shift(@insns));
	eval(shift(@insns));
	&psrld	($t2,7);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	eval(shift(@insns));
	&pshufd	($t3,@X[3],0b11111010);		# X[14..15]
	eval(shift(@insns));
	eval(shift(@insns));
	&pslld	($t1,32-18);
	eval(shift(@insns));
	eval(shift(@insns));			# @
	&pxor	($t0,$t2);
	eval(shift(@insns));
	eval(shift(@insns));
	&psrld	($t2,18-7);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	&pxor	($t0,$t1);
	eval(shift(@insns));
	eval(shift(@insns));
	&pslld	($t1,18-7);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	&pxor	($t0,$t2);
	eval(shift(@insns));
	eval(shift(@insns));
	&movdqa	($t2,$t3);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	&pxor	($t0,$t1);			# sigma0(X[1..4])
	eval(shift(@insns));
	eval(shift(@insns));
	&psrld	($t3,10);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	&paddd	(@X[0],$t0);			# X[0..3] += sigma0(X[1..4])
	eval(shift(@insns));
	eval(shift(@insns));
	&psrlq	($t2,17);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	&pxor	($t3,$t2);
	eval(shift(@insns));
	eval(shift(@insns));
	&psrlq	($t2,19-17);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	&pxor	($t3,$t2);
	eval(shift(@insns));
	eval(shift(@insns));
	&pshufd	($t3,$t3,0b10000000);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	eval(shift(@insns));
	&psrldq	($t3,8);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&paddd	(@X[0],$t3);			# X[0..1] += sigma1(X[14..15])
	eval(shift(@insns));			# @
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	eval(shift(@insns));
	&pshufd	($t3,@X[0],0b01010000);		# X[16..17]
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&movdqa	($t2,$t3);
	eval(shift(@insns));			# @
	&psrld	($t3,10);
	eval(shift(@insns));
	&psrlq	($t2,17);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	&pxor	($t3,$t2);
	eval(shift(@insns));
	eval(shift(@insns));
	&psrlq	($t2,19-17);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	&pxor	($t3,$t2);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&pshufd	($t3,$t3,0b00001000);
	eval(shift(@insns));
	eval(shift(@insns));			# @
	&movdqa	($t2,&QWP(16*$j,$K256));
	eval(shift(@insns));
	eval(shift(@insns));
	&pslldq	($t3,8);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));			# @
	&paddd	(@X[0],$t3);			# X[2..3] += sigma1(X[16..17])
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&paddd	($t2,@X[0]);
	eval(shift(@insns));			# @

	foreach (@insns) { eval; }		# remaining instructions

	&movdqa	(&QWP(32+16*$j,"esp"),$t2);
}
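
# Each SSSE3_00_47 call above interleaves one message-schedule update
# (four new X words computed in XMM registers) with four scalar rounds:
# the vector instructions are threaded through the ~120 round
# instructions via the eval(shift(@insns)) calls, so the integer and
# SIMD units run in parallel and the sigma-computation latency hides
# behind the round function.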

sub body_00_15 () {
	(
	'&mov	("ecx",$E);',
	'&ror	($E,25-11);',
	'&mov	("esi",&off($f));',
	'&xor	($E,"ecx");',
	'&mov	("edi",&off($g));',
	'&xor	("esi","edi");',
	'&ror	($E,11-6);',
	'&and	("esi","ecx");',
	'&mov	(&off($e),"ecx");',	# save $E, modulo-scheduled
	'&xor	($E,"ecx");',
	'&xor	("edi","esi");',	# Ch(e,f,g)
	'&ror	($E,6);',		# T = Sigma1(e)
	'&mov	("ecx",$AH[0]);',
	'&add	($E,"edi");',		# T += Ch(e,f,g)
	'&mov	("edi",&off($b));',
	'&mov	("esi",$AH[0]);',

	'&ror	("ecx",22-13);',
	'&mov	(&off($a),$AH[0]);',	# save $A, modulo-scheduled
	'&xor	("ecx",$AH[0]);',
	'&xor	($AH[0],"edi");',	# a ^= b, (b^c) in next round
	'&add	($E,&off($h));',	# T += h
	'&ror	("ecx",13-2);',
	'&and	($AH[1],$AH[0]);',	# (b^c) &= (a^b)
	'&xor	("ecx","esi");',
	'&add	($E,&DWP(32+4*($i&15),"esp"));',	# T += K[i]+X[i]
	'&xor	($AH[1],"edi");',	# h = Maj(a,b,c) = Ch(a^b,c,b)
	'&ror	("ecx",2);',		# Sigma0(a)

	'&add	($AH[1],$E);',		# h += T
	'&add	($E,&off($d));',	# d += T
	'&add	($AH[1],"ecx");'.	# h += Sigma0(a)

	'@AH = reverse(@AH); $i++;'	# rotate(a,h)
	);
}

	for ($i=0,$j=0; $j<4; $j++) {
		&SSSE3_00_47($j,\&body_00_15,@X);
		push(@X,shift(@X));		# rotate(@X)
	}
	&cmp	(&DWP(16*$j,$K256),0x00010203);
	&jne	(&label("ssse3_00_47"));

	for ($i=0; $i<16; ) {
		foreach(body_00_15()) { eval; }
	}

	&mov	("esi",&DWP(96,"esp"));	# ctx
	#&mov	($AH[0],&DWP(0,"esp"));
	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
	#&mov	("edi", &DWP(8,"esp"));
	&mov	("ecx",&DWP(12,"esp"));
	&add	($AH[0],&DWP(0,"esi"));
	&add	($AH[1],&DWP(4,"esi"));
	&add	("edi",&DWP(8,"esi"));
	&add	("ecx",&DWP(12,"esi"));
	&mov	(&DWP(0,"esi"),$AH[0]);
	&mov	(&DWP(4,"esi"),$AH[1]);
	&mov	(&DWP(8,"esi"),"edi");
	&mov	(&DWP(12,"esi"),"ecx");
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	&xor	($AH[1],"edi");			# magic
	&mov	(&DWP(8,"esp"),"edi");
	&mov	(&DWP(12,"esp"),"ecx");
	#&mov	($E,&DWP(16,"esp"));
	&mov	("edi",&DWP(20,"esp"));
	&mov	("ecx",&DWP(24,"esp"));
	&add	($E,&DWP(16,"esi"));
	&add	("edi",&DWP(20,"esi"));
	&add	("ecx",&DWP(24,"esi"));
	&mov	(&DWP(16,"esi"),$E);
	&mov	(&DWP(20,"esi"),"edi");
	&mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(28,"esp"));
	&mov	(&DWP(24,"esi"),"ecx");
	#&mov	(&DWP(16,"esp"),$E);
	&add	("edi",&DWP(28,"esi"));
	&mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esi"),"edi");
	&mov	(&DWP(28,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp

	&movdqa	($t3,&QWP(64,$K256));
	&sub	($K256,3*64);			# rewind K
	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
	&jb	(&label("grand_ssse3"));

	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
&function_end_A();
						if ($avx) {
&set_label("AVX",32);
						if ($avx>1) {
	&and	("edx",1<<8|1<<3);		# check for BMI2+BMI1
	&cmp	("edx",1<<8|1<<3);
	&je	(&label("AVX_BMI"));
						}
	&lea	("esp",&DWP(-96,"esp"));
	&vzeroall	();
	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
	&mov	($AH[0],&DWP(0,"esi"));
	&mov	($AH[1],&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edi",&DWP(12,"esi"));
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	&xor	($AH[1],"ecx");			# magic
	&mov	(&DWP(8,"esp"),"ecx");
	&mov	(&DWP(12,"esp"),"edi");
	&mov	($E,&DWP(16,"esi"));
	&mov	("edi",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("esi",&DWP(28,"esi"));
	#&mov	(&DWP(16,"esp"),$E);
	&mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp
	&mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esp"),"esi");
	&vmovdqa	($t3,&QWP(256,$K256));
	&jmp	(&label("grand_avx"));

&set_label("grand_avx",32);
	# load input, reverse byte order, add K256[0..15], save to stack
	&vmovdqu	(@X[0],&QWP(0,"edi"));
	&vmovdqu	(@X[1],&QWP(16,"edi"));
	&vmovdqu	(@X[2],&QWP(32,"edi"));
	&vmovdqu	(@X[3],&QWP(48,"edi"));
	&add	("edi",64);
	&vpshufb	(@X[0],@X[0],$t3);
	&mov	(&DWP(96+4,"esp"),"edi");
	&vpshufb	(@X[1],@X[1],$t3);
	&vpshufb	(@X[2],@X[2],$t3);
	&vpaddd	($t0,@X[0],&QWP(0,$K256));
	&vpshufb	(@X[3],@X[3],$t3);
	&vpaddd	($t1,@X[1],&QWP(16,$K256));
	&vpaddd	($t2,@X[2],&QWP(32,$K256));
	&vpaddd	($t3,@X[3],&QWP(48,$K256));
	&vmovdqa	(&QWP(32+0,"esp"),$t0);
	&vmovdqa	(&QWP(32+16,"esp"),$t1);
	&vmovdqa	(&QWP(32+32,"esp"),$t2);
	&vmovdqa	(&QWP(32+48,"esp"),$t3);
	&jmp	(&label("avx_00_47"));

&set_label("avx_00_47",16);
	&add	($K256,64);

sub Xupdate_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],4);',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],4);',	# X[9..12]
	'&vpsrld	($t2,$t0,7);',
	'&vpaddd	(@X[0],@X[0],$t3);',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,3);',
	'&vpslld	($t1,$t0,14);',
	'&vpxor	($t0,$t3,$t2);',
	'&vpshufd	($t3,@X[3],0b11111010)',	# X[14..15]
	'&vpsrld	($t2,$t2,18-7);',
	'&vpxor	($t0,$t0,$t1);',
	'&vpslld	($t1,$t1,25-14);',
	'&vpxor	($t0,$t0,$t2);',
	'&vpsrld	($t2,$t3,10);',
	'&vpxor	($t0,$t0,$t1);',	# sigma0(X[1..4])
	'&vpsrlq	($t1,$t3,17);',
	'&vpaddd	(@X[0],@X[0],$t0);',	# X[0..3] += sigma0(X[1..4])
	'&vpxor	($t2,$t2,$t1);',
	'&vpsrlq	($t3,$t3,19);',
	'&vpxor	($t2,$t2,$t3);',	# sigma1(X[14..15])
	'&vpshufd	($t3,$t2,0b10000100);',
	'&vpsrldq	($t3,$t3,8);',
	'&vpaddd	(@X[0],@X[0],$t3);',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',	# X[16..17]
	'&vpsrld	($t2,$t3,10);',
	'&vpsrlq	($t1,$t3,17);',
	'&vpxor	($t2,$t2,$t1);',
	'&vpsrlq	($t3,$t3,19);',
	'&vpxor	($t2,$t2,$t3);',	# sigma1(X[16..17])
	'&vpshufd	($t3,$t2,0b11101000);',
	'&vpslldq	($t3,$t3,8);',
	'&vpaddd	(@X[0],@X[0],$t3);'	# X[2..3] += sigma1(X[16..17])
	);
}
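
# Relative to the SSSE3 schedule above, the three-operand AVX forms
# (e.g. vpsrld $t2,$t0,7) write to a fresh destination, so the movdqa
# register copies of the SSSE3 path disappear; that is the bulk of the
# instruction-count saving in this code path.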

local *ror = sub { &shrd(@_[0],@_) };
sub AVX_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions
my $insn;

	foreach (Xupdate_AVX()) {		# 31 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval($insn = shift(@insns));
	    eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/);
	}
	&vpaddd	($t2,@X[0],&QWP(16*$j,$K256));
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(&QWP(32+16*$j,"esp"),$t2);
}

	for ($i=0,$j=0; $j<4; $j++) {
		&AVX_00_47($j,\&body_00_15,@X);
		push(@X,shift(@X));		# rotate(@X)
	}
	&cmp	(&DWP(16*$j,$K256),0x00010203);
	&jne	(&label("avx_00_47"));

	for ($i=0; $i<16; ) {
		foreach(body_00_15()) { eval; }
	}

	&mov	("esi",&DWP(96,"esp"));	# ctx
	#&mov	($AH[0],&DWP(0,"esp"));
	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
	#&mov	("edi", &DWP(8,"esp"));
	&mov	("ecx",&DWP(12,"esp"));
	&add	($AH[0],&DWP(0,"esi"));
	&add	($AH[1],&DWP(4,"esi"));
	&add	("edi",&DWP(8,"esi"));
	&add	("ecx",&DWP(12,"esi"));
	&mov	(&DWP(0,"esi"),$AH[0]);
	&mov	(&DWP(4,"esi"),$AH[1]);
	&mov	(&DWP(8,"esi"),"edi");
	&mov	(&DWP(12,"esi"),"ecx");
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	&xor	($AH[1],"edi");			# magic
	&mov	(&DWP(8,"esp"),"edi");
	&mov	(&DWP(12,"esp"),"ecx");
	#&mov	($E,&DWP(16,"esp"));
	&mov	("edi",&DWP(20,"esp"));
	&mov	("ecx",&DWP(24,"esp"));
	&add	($E,&DWP(16,"esi"));
	&add	("edi",&DWP(20,"esi"));
	&add	("ecx",&DWP(24,"esi"));
	&mov	(&DWP(16,"esi"),$E);
	&mov	(&DWP(20,"esi"),"edi");
	&mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(28,"esp"));
	&mov	(&DWP(24,"esi"),"ecx");
	#&mov	(&DWP(16,"esp"),$E);
	&add	("edi",&DWP(28,"esi"));
	&mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esi"),"edi");
	&mov	(&DWP(28,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp

	&vmovdqa	($t3,&QWP(64,$K256));
	&sub	($K256,3*64);			# rewind K
	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
	&jb	(&label("grand_avx"));

	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
	&vzeroall	();
&function_end_A();
						if ($avx>1) {
sub bodyx_00_15 () {			# +10%
	(
	'&rorx	("ecx",$E,6)',
	'&rorx	("esi",$E,11)',
	'&mov	(&off($e),$E)',		# save $E, modulo-scheduled
	'&rorx	("edi",$E,25)',
	'&xor	("ecx","esi")',
	'&andn	("esi",$E,&off($g))',
	'&xor	("ecx","edi")',		# Sigma1(e)
	'&and	($E,&off($f))',
	'&mov	(&off($a),$AH[0]);',	# save $A, modulo-scheduled
	'&or	($E,"esi")',		# T = Ch(e,f,g)

	'&rorx	("edi",$AH[0],2)',
	'&rorx	("esi",$AH[0],13)',
	'&lea	($E,&DWP(0,$E,"ecx"))',	# T += Sigma1(e)
	'&rorx	("ecx",$AH[0],22)',
	'&xor	("esi","edi")',
	'&mov	("edi",&off($b))',
	'&xor	("ecx","esi")',		# Sigma0(a)

	'&xor	($AH[0],"edi")',	# a ^= b, (b^c) in next round
	'&add	($E,&off($h))',		# T += h
	'&and	($AH[1],$AH[0])',	# (b^c) &= (a^b)
	'&add	($E,&DWP(32+4*($i&15),"esp"))',	# T += K[i]+X[i]
	'&xor	($AH[1],"edi")',	# h = Maj(a,b,c) = Ch(a^b,c,b)

	'&add	("ecx",$E)',		# h += T
	'&add	($E,&off($d))',		# d += T
	'&lea	($AH[1],&DWP(0,$AH[1],"ecx"));'.	# h += Sigma0(a)

	'@AH = reverse(@AH); $i++;'	# rotate(a,h)
	);
}
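
# bodyx_00_15 trades the ror/xor chains for BMI1/BMI2: rorx rotates
# into a different destination without touching flags, so Sigma1(e)
# and Sigma0(a) need no register copies, and Ch(e,f,g) = (e&f)|(~e&g)
# falls out of andn("esi",$E,g) for ~e&g followed by and($E,f) and
# or($E,"esi"), with no explicit complement.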

&set_label("AVX_BMI",32);
	&lea	("esp",&DWP(-96,"esp"));
	&vzeroall	();
	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
	&mov	($AH[0],&DWP(0,"esi"));
	&mov	($AH[1],&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edi",&DWP(12,"esi"));
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	&xor	($AH[1],"ecx");			# magic
	&mov	(&DWP(8,"esp"),"ecx");
	&mov	(&DWP(12,"esp"),"edi");
	&mov	($E,&DWP(16,"esi"));
	&mov	("edi",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("esi",&DWP(28,"esi"));
	#&mov	(&DWP(16,"esp"),$E);
	&mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp
	&mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esp"),"esi");
	&vmovdqa	($t3,&QWP(256,$K256));
	&jmp	(&label("grand_avx_bmi"));

&set_label("grand_avx_bmi",32);
	# load input, reverse byte order, add K256[0..15], save to stack
	&vmovdqu	(@X[0],&QWP(0,"edi"));
	&vmovdqu	(@X[1],&QWP(16,"edi"));
	&vmovdqu	(@X[2],&QWP(32,"edi"));
	&vmovdqu	(@X[3],&QWP(48,"edi"));
	&add	("edi",64);
	&vpshufb	(@X[0],@X[0],$t3);
	&mov	(&DWP(96+4,"esp"),"edi");
	&vpshufb	(@X[1],@X[1],$t3);
	&vpshufb	(@X[2],@X[2],$t3);
	&vpaddd	($t0,@X[0],&QWP(0,$K256));
	&vpshufb	(@X[3],@X[3],$t3);
	&vpaddd	($t1,@X[1],&QWP(16,$K256));
	&vpaddd	($t2,@X[2],&QWP(32,$K256));
	&vpaddd	($t3,@X[3],&QWP(48,$K256));
	&vmovdqa	(&QWP(32+0,"esp"),$t0);
	&vmovdqa	(&QWP(32+16,"esp"),$t1);
	&vmovdqa	(&QWP(32+32,"esp"),$t2);
	&vmovdqa	(&QWP(32+48,"esp"),$t3);
	&jmp	(&label("avx_bmi_00_47"));

&set_label("avx_bmi_00_47",16);
	&add	($K256,64);

	for ($i=0,$j=0; $j<4; $j++) {
		&AVX_00_47($j,\&bodyx_00_15,@X);
		push(@X,shift(@X));		# rotate(@X)
	}
	&cmp	(&DWP(16*$j,$K256),0x00010203);
	&jne	(&label("avx_bmi_00_47"));

	for ($i=0; $i<16; ) {
		foreach(bodyx_00_15()) { eval; }
	}

	&mov	("esi",&DWP(96,"esp"));	# ctx
	#&mov	($AH[0],&DWP(0,"esp"));
	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
	#&mov	("edi", &DWP(8,"esp"));
	&mov	("ecx",&DWP(12,"esp"));
	&add	($AH[0],&DWP(0,"esi"));
	&add	($AH[1],&DWP(4,"esi"));
	&add	("edi",&DWP(8,"esi"));
	&add	("ecx",&DWP(12,"esi"));
	&mov	(&DWP(0,"esi"),$AH[0]);
	&mov	(&DWP(4,"esi"),$AH[1]);
	&mov	(&DWP(8,"esi"),"edi");
	&mov	(&DWP(12,"esi"),"ecx");
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	&xor	($AH[1],"edi");			# magic
	&mov	(&DWP(8,"esp"),"edi");
	&mov	(&DWP(12,"esp"),"ecx");
	#&mov	($E,&DWP(16,"esp"));
	&mov	("edi",&DWP(20,"esp"));
	&mov	("ecx",&DWP(24,"esp"));
	&add	($E,&DWP(16,"esi"));
	&add	("edi",&DWP(20,"esi"));
	&add	("ecx",&DWP(24,"esi"));
	&mov	(&DWP(16,"esi"),$E);
	&mov	(&DWP(20,"esi"),"edi");
	&mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(28,"esp"));
	&mov	(&DWP(24,"esi"),"ecx");
	#&mov	(&DWP(16,"esp"),$E);
	&add	("edi",&DWP(28,"esi"));
	&mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esi"),"edi");
	&mov	(&DWP(28,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp

	&vmovdqa	($t3,&QWP(64,$K256));
	&sub	($K256,3*64);			# rewind K
	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
	&jb	(&label("grand_avx_bmi"));

	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
	&vzeroall	();
&function_end_A();
						}
						}
						}}}
&function_end_B("sha256_block_data_order");

&asm_finish();

close STDOUT;